summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Joel Klinghed <the_jk@spawned.biz> 2025-09-27 20:11:32 +0200
committer Joel Klinghed <the_jk@spawned.biz> 2025-09-28 22:48:24 +0200
commit c1ae5d53fb0fa7ceb9d6fc7a60c87df958ce37fe (patch)
tree f028a04619aa1b69f8b0aa72a5154f6ba1c09775
parent 2f13baa843bd1fb5db6630a2823681ffaff9fb11 (diff)
WIPWIP
-rw-r--r--data/java-8/tokens.grammar385
-rw-r--r--meson.build119
-rw-r--r--src/errors.cc116
-rw-r--r--src/errors.hh64
-rw-r--r--src/gen_tokens.cc1019
-rw-r--r--src/gen_ugc.cc3
-rw-r--r--src/grammar.cc268
-rw-r--r--src/grammar.hh80
-rw-r--r--src/java_tokens.cc374
-rw-r--r--src/java_tokens.hh92
-rw-r--r--src/java_version.hh16
-rw-r--r--src/location.cc12
-rw-r--r--src/location.hh31
-rw-r--r--src/prefix_tree.cc112
-rw-r--r--src/str.cc19
-rw-r--r--src/str.hh3
-rw-r--r--src/ugc.hh2
-rw-r--r--test/java_tokens.cc551
-rw-r--r--test/java_uescape.cc9
-rw-r--r--test/prefix_tree.cc26
-rw-r--r--test/u.cc2
21 files changed, 3285 insertions, 18 deletions
diff --git a/data/java-8/tokens.grammar b/data/java-8/tokens.grammar
new file mode 100644
index 0000000..3521ac0
--- /dev/null
+++ b/data/java-8/tokens.grammar
@@ -0,0 +1,385 @@
+InputElement:
+ WhiteSpace
+ Comment
+ Token
+
+Token:
+ Identifier
+ Keyword
+ Literal
+ Separator
+ Operator
+
+Comment:
+ TraditionalComment
+ EndOfLineComment
+
+TraditionalComment:
+ / * CommentTail
+
+CommentTail:
+ * CommentTailStar
+ NotStar CommentTail
+
+CommentTailStar:
+ /
+ * CommentTailStar
+ NotStarNotSlash CommentTail
+
+NotStar:
+ InputCharacter but not *
+ LineTerminator
+
+NotStarNotSlash:
+ InputCharacter but not * or /
+ LineTerminator
+
+EndOfLineComment:
+ / / {InputCharacter}
+
+Identifier:
+ IdentifierChars but not Keyword or BooleanLiteral or NullLiteral
+
+IdentifierChars:
+ JavaLetter {JavaLetterOrDigit}
+
+Keyword:
+ abstract
+ continue
+ for
+ new
+ switch
+ assert
+ default
+ if
+ package
+ synchronized
+ boolean
+ do
+ goto
+ private
+ this
+ break
+ double
+ implements
+ protected
+ throw
+ byte
+ else
+ import
+ public
+ throws
+ case
+ enum
+ instanceof
+ return
+ transient
+ catch
+ extends
+ int
+ short
+ try
+ char
+ final
+ interface
+ static
+ void
+ class
+ finally
+ long
+ strictfp
+ volatile
+ const
+ float
+ native
+ super
+ while
+
+Literal:
+ IntegerLiteral
+ FloatingPointLiteral
+ BooleanLiteral
+ CharacterLiteral
+ StringLiteral
+ NullLiteral
+
+IntegerLiteral:
+ DecimalIntegerLiteral
+ HexIntegerLiteral
+ OctalIntegerLiteral
+ BinaryIntegerLiteral
+
+DecimalIntegerLiteral:
+ DecimalNumeral [IntegerTypeSuffix]
+
+HexIntegerLiteral:
+ HexNumeral [IntegerTypeSuffix]
+
+OctalIntegerLiteral:
+ OctalNumeral [IntegerTypeSuffix]
+
+BinaryIntegerLiteral:
+ BinaryNumeral [IntegerTypeSuffix]
+
+IntegerTypeSuffix:
+ l
+ L
+
+DecimalNumeral:
+ 0
+ NonZeroDigit [Digits]
+ NonZeroDigit Underscores Digits
+
+NonZeroDigit:
+ 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+
+Digits:
+ Digit
+ Digit [DigitsAndUnderscores] Digit
+
+Digit:
+ 0
+ NonZeroDigit
+
+DigitsAndUnderscores:
+ DigitOrUnderscore {DigitOrUnderscore}
+
+DigitOrUnderscore:
+ Digit
+ _
+
+Underscores:
+ _ {_}
+
+HexNumeral:
+ 0 x HexDigits
+ 0 X HexDigits
+
+HexDigits:
+ HexDigit
+ HexDigit [HexDigitsAndUnderscores] HexDigit
+
+HexDigit:
+ 0
+ 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+ a
+ b
+ c
+ d
+ e
+ f
+ A
+ B
+ C
+ D
+ E
+ F
+
+HexDigitsAndUnderscores:
+ HexDigitOrUnderscore {HexDigitOrUnderscore}
+
+HexDigitOrUnderscore:
+ HexDigit
+ _
+
+OctalNumeral:
+ 0 OctalDigits
+ 0 Underscores OctalDigits
+
+OctalDigits:
+ OctalDigit
+ OctalDigit [OctalDigitsAndUnderscores] OctalDigit
+
+OctalDigit:
+ 0
+ 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+
+OctalDigitsAndUnderscores:
+ OctalDigitOrUnderscore {OctalDigitOrUnderscore}
+
+OctalDigitOrUnderscore:
+ OctalDigit
+ _
+
+BinaryNumeral:
+ 0 b BinaryDigits
+ 0 B BinaryDigits
+
+BinaryDigits:
+ BinaryDigit
+ BinaryDigit [BinaryDigitsAndUnderscores] BinaryDigit
+
+BinaryDigit:
+ 0
+ 1
+
+BinaryDigitsAndUnderscores:
+ BinaryDigitOrUnderscore {BinaryDigitOrUnderscore}
+
+BinaryDigitOrUnderscore:
+ BinaryDigit
+ _
+
+FloatingPointLiteral:
+ DecimalFloatingPointLiteral
+ HexadecimalFloatingPointLiteral
+
+DecimalFloatingPointLiteral:
+ Digits . [Digits] [ExponentPart] [FloatTypeSuffix]
+ . Digits [ExponentPart] [FloatTypeSuffix]
+ Digits ExponentPart [FloatTypeSuffix]
+ Digits [ExponentPart] FloatTypeSuffix
+
+ExponentPart:
+ ExponentIndicator SignedInteger
+
+ExponentIndicator:
+ e
+ E
+
+SignedInteger:
+ [Sign] Digits
+
+Sign:
+ +
+ -
+
+FloatTypeSuffix:
+ f
+ F
+ d
+ D
+
+HexadecimalFloatingPointLiteral:
+ HexSignificand BinaryExponent [FloatTypeSuffix]
+
+HexSignificand:
+ HexNumeral [.]
+ 0 x [HexDigits] . HexDigits
+ 0 X [HexDigits] . HexDigits
+
+BinaryExponent:
+ BinaryExponentIndicator SignedInteger
+
+BinaryExponentIndicator:
+ p
+ P
+
+BooleanLiteral:
+ true
+ false
+
+CharacterLiteral:
+ ' SingleCharacter '
+ ' EscapeSequence '
+
+SingleCharacter:
+ InputCharacter but not ' or \
+
+StringLiteral:
+ " {StringCharacter} "
+
+StringCharacter:
+ InputCharacter but not " or \
+ EscapeSequence
+
+EscapeSequence:
+ \ b
+ \ t
+ \ n
+ \ f
+ \ r
+ \ "
+ \ '
+ \ \
+ OctalEscape
+
+OctalEscape:
+ \ OctalDigit
+ \ OctalDigit OctalDigit
+ \ ZeroToThree OctalDigit OctalDigit
+
+ZeroToThree:
+ 0
+ 1
+ 2
+ 3
+
+NullLiteral:
+ null
+
+Separator:
+ (
+ )
+ {
+ }
+ [
+ ]
+ ;
+ ,
+ .
+ ...
+ @
+ ::
+
+Operator:
+ =
+ >
+ <
+ !
+ ~
+ ?
+ :
+ ->
+ ==
+ >=
+ <=
+ !=
+ &&
+ ||
+ ++
+ --
+ +
+ -
+ *
+ /
+ &
+ |
+ ^
+ %
+ <<
+ >>
+ >>>
+ +=
+ -=
+ *=
+ /=
+ &=
+ |=
+ ^=
+ %=
+ <<=
+ >>=
+ >>>=
diff --git a/meson.build b/meson.build
index 1607088..d1f6908 100644
--- a/meson.build
+++ b/meson.build
@@ -12,6 +12,23 @@ project(
],
)
+# Extra C++ compiler arguments applied to the whole project.
+cpp_flags = []
+# Flags that are only added when the compiler accepts them.
+cpp_optional_flags = []
+if get_option('buildtype') == 'release'
+ # With asserts disabled (NDEBUG), parameters and variables that are only
+ # used inside asserts would otherwise cause unused warnings.
+ cpp_optional_flags += ['-Wno-unused-parameter', '-Wno-unused-variable',
+ '-Wno-unused-but-set-variable']
+ cpp_flags += '-DNDEBUG'
+endif
+cpp = meson.get_compiler('cpp')
+# Keep only the optional flags this compiler understands.
+foreach flag : cpp_optional_flags
+ if cpp.has_argument(flag)
+ cpp_flags += flag
+ endif
+endforeach
+add_project_arguments([cpp_flags], language: 'cpp')
+
conf_data = configuration_data()
conf_data.set('version', meson.project_version())
configure_file(input: 'src/config.h.in',
@@ -191,6 +208,34 @@ java_uescape_dep = declare_dependency(
dependencies: [buffer_dep, uio_dep],
)
+# Error reporting and source location helpers; the grammar library, the
+# gen_tokens tool and the java library below all depend on this.
+src_lib = library(
+ 'src',
+ sources: [
+ 'src/errors.cc',
+ 'src/errors.hh',
+ 'src/location.cc',
+ 'src/location.hh',
+ ],
+ include_directories: inc,
+)
+src_dep = declare_dependency(
+ link_with: src_lib,
+)
+
+# Grammar library (src/grammar.cc); used by the gen_tokens tool below.
+grammar_lib = library(
+ 'grammar',
+ sources: [
+ 'src/grammar.cc',
+ 'src/grammar.hh',
+ ],
+ include_directories: inc,
+ dependencies: [io_dep, src_dep, str_dep],
+)
+# Users of the grammar library also get its own dependencies.
+grammar_dep = declare_dependency(
+ link_with: grammar_lib,
+ dependencies: [io_dep, src_dep, str_dep],
+)
+
prefix_tree_lib = library(
'prefix_tree',
sources: [
@@ -201,6 +246,69 @@ prefix_tree_lib = library(
)
prefix_tree_dep = declare_dependency(link_with: prefix_tree_lib)
+# Build-time code generator (not installed); it is run by the custom targets
+# below to turn a tokens.grammar file into tokenizer sources.
+gen_tokens = executable(
+ 'gen_tokens',
+ sources: [
+ 'src/gen_tokens.cc',
+ ],
+ include_directories: inc,
+ install : false,
+ dependencies : [
+ args_dep,
+ grammar_dep,
+ prefix_tree_dep,
+ src_dep,
+ ],
+)
+
+# Java versions to generate a tokenizer for; each one reads
+# data/<version>/tokens.grammar.
+java_versions = [
+ 'java-8',
+]
+
+# Unicode version handed to gen_tokens for each Java version.
+java_unicode_versions = {
+ 'java-8': '6.2.0',
+}
+
+java_tokens_sources = []
+foreach java_version : java_versions
+ # '6.2.0' becomes 'u6_2_0', the value passed to gen_tokens via --unicode.
+ java_unicode_version = 'u' + java_unicode_versions[java_version].replace('.', '_')
+ # Runs gen_tokens on the grammar to produce a header/source pair. The
+ # namespace is the version name with '.' and '-' replaced by '_'.
+ java_tokens_sources += custom_target(
+ 'gen-tokens-' + java_version,
+ input: ['data/' + java_version + '/tokens.grammar'],
+ output: ['java_tokens_' + java_version + '.hh',
+ 'java_tokens_' + java_version + '.cc'],
+ command : [gen_tokens, '--namespace',
+ java_version.replace('.', '_').replace('-', '_'),
+ '--unicode', java_unicode_version,
+ '@INPUT@', '@OUTPUT@'])
+endforeach
+
+# Java library: the hand-written java_tokens sources plus the generated
+# per-version token sources collected in java_tokens_sources.
+java_lib = library(
+ 'java',
+ sources: [
+ 'src/java_tokens.hh',
+ 'src/java_tokens.cc',
+ 'src/java_version.hh',
+ java_tokens_sources,
+ ],
+ include_directories: inc,
+ dependencies: [
+ java_uescape_dep,
+ prefix_tree_dep,
+ src_dep,
+ str_dep,
+ uline_dep,
+ ]
+)
+# Users of the java library get the same dependencies as the library itself.
+java_dep = declare_dependency(link_with: java_lib,
+ dependencies: [
+ java_uescape_dep,
+ prefix_tree_dep,
+ src_dep,
+ str_dep,
+ uline_dep,
+ ])
+
jkc = executable(
'jkc',
sources: [
@@ -351,12 +459,23 @@ test('java_uescape', executable(
],
))
+# Test executable for the Java tokenizer, built from test/java_tokens.cc.
+test('java_tokens', executable(
+ 'test_java_tokens',
+ sources: ['test/java_tokens.cc'],
+ include_directories: inc,
+ dependencies: [
+ java_dep,
+ test_dependencies,
+ ],
+))
+
test('prefix_tree', executable(
'test_prefix_tree',
sources: ['test/prefix_tree.cc'],
include_directories: inc,
dependencies: [
prefix_tree_dep,
+ str_dep,
test_dependencies,
],
))
diff --git a/src/errors.cc b/src/errors.cc
new file mode 100644
index 0000000..bf92c3b
--- /dev/null
+++ b/src/errors.cc
@@ -0,0 +1,116 @@
+#include "errors.hh"
+
+#include "location.hh"
+
+#include <cstdint>
+#include <format>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <string_view>
+#include <utility>
+
+namespace src {
+
+namespace {
+
+// Errors implementation bound to one file name. Every message is handed to
+// the ErrorsOutput as "<filename>:<line>:<column>: <Severity> <msg>". Errors
+// and warnings are counted; debug messages are printed but not counted.
+class FileErrors : public Errors {
+ public:
+  FileErrors(std::string filename, std::shared_ptr<ErrorsOutput> output)
+      : filename_(std::move(filename)), output_(std::move(output)) {}
+
+  // Counts and prints an error located at `loc`.
+  void err(Location loc, std::string_view msg) override {
+    ++errors_;
+    auto const text = std::format("{}:{}:{}: Error {}", filename_, loc.line,
+                                  loc.column, msg);
+    output_->println(text);
+  }
+
+  // Counts and prints a warning located at `loc`.
+  void warn(Location loc, std::string_view msg) override {
+    ++warnings_;
+    auto const text = std::format("{}:{}:{}: Warning {}", filename_, loc.line,
+                                  loc.column, msg);
+    output_->println(text);
+  }
+
+#if !defined(NDEBUG)
+  // Prints a debug message; only present when NDEBUG is not defined.
+  void dbg(Location loc, std::string_view msg) override {
+    auto const text = std::format("{}:{}:{}: Debug {}", filename_, loc.line,
+                                  loc.column, msg);
+    output_->println(text);
+  }
+#endif
+
+  // Number of err() calls so far.
+  [[nodiscard]]
+  uint64_t errors() const override {
+    return errors_;
+  }
+
+  // Number of warn() calls so far.
+  [[nodiscard]]
+  uint64_t warnings() const override {
+    return warnings_;
+  }
+
+ private:
+  std::string const filename_;
+  std::shared_ptr<ErrorsOutput> output_;
+  uint64_t errors_ = 0;
+  uint64_t warnings_ = 0;
+};
+
+// Errors implementation that drops every message and always reports zero
+// errors and zero warnings.
+class IgnoreErrors : public Errors {
+ public:
+  void err(Location /* loc */, std::string_view /* msg */) override {
+    // Intentionally ignored.
+  }
+
+  void warn(Location /* loc */, std::string_view /* msg */) override {
+    // Intentionally ignored.
+  }
+
+#if !defined(NDEBUG)
+  void dbg(Location /* loc */, std::string_view /* msg */) override {
+    // Intentionally ignored.
+  }
+#endif
+
+  // Nothing is ever counted.
+  [[nodiscard]]
+  uint64_t errors() const override {
+    return 0;
+  }
+
+  // Nothing is ever counted.
+  [[nodiscard]]
+  uint64_t warnings() const override {
+    return 0;
+  }
+};
+
+// ErrorsOutput that writes to a std::ostream. Only a reference is kept, so
+// the stream has to outlive this object.
+class OutputStreamErrorsOutput : public ErrorsOutput {
+ public:
+  explicit OutputStreamErrorsOutput(std::ostream& out) : out_(out) {}
+
+  // Writes `line` followed by a newline character.
+  void println(std::string_view line) override {
+    out_ << line << '\n';
+  }
+
+ private:
+  std::ostream& out_;
+};
+
+} // namespace
+
+// Creates an Errors object that prefixes every message with `filename` and
+// the reported location. Messages are sent to `output`; when `output` is
+// null a process-wide output writing to std::cerr is used instead.
+[[nodiscard]]
+std::unique_ptr<Errors> file_errors(std::string filename,
+                                    std::shared_ptr<ErrorsOutput> output) {
+  if (!output) {
+    // Initialising the function-local static directly (instead of testing
+    // and assigning it afterwards) lets the language guarantee that it is
+    // created exactly once, even if several threads get here at once.
+    static std::shared_ptr<ErrorsOutput> const stderr_output =
+        std::make_shared<OutputStreamErrorsOutput>(std::cerr);
+    output = stderr_output;
+  }
+  return std::make_unique<FileErrors>(std::move(filename), std::move(output));
+}
+
+// Creates an Errors object that discards every message and always reports
+// zero errors and zero warnings.
+[[nodiscard]]
+std::unique_ptr<Errors> ignore_errors() {
+ return std::make_unique<IgnoreErrors>();
+}
+
+// Creates an ErrorsOutput that writes each line, followed by '\n', to `out`.
+// Only a reference to `out` is stored, so the stream must outlive the
+// returned object.
+[[nodiscard]]
+std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out) {
+ return std::make_unique<OutputStreamErrorsOutput>(out);
+}
+
+} // namespace src
diff --git a/src/errors.hh b/src/errors.hh
new file mode 100644
index 0000000..d8b3d60
--- /dev/null
+++ b/src/errors.hh
@@ -0,0 +1,64 @@
+#ifndef ERRORS_HH
+#define ERRORS_HH
+
+#include "location.hh"
+
+#include <cstdint>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace src {
+
+// Interface for reporting diagnostics tied to a source Location and for
+// querying how many errors and warnings have been reported.
+// Instances cannot be copied.
+class Errors {
+ public:
+ virtual ~Errors() = default;
+
+ // Reports an error at `loc`.
+ virtual void err(Location loc, std::string_view msg) = 0;
+
+ // Reports a warning at `loc`.
+ virtual void warn(Location loc, std::string_view msg) = 0;
+
+ // Debug messages: implemented by subclasses when NDEBUG is not defined;
+ // with NDEBUG defined dbg() is a non-virtual no-op, so calls can stay in
+ // the code.
+#if !defined(NDEBUG)
+ virtual void dbg(Location loc, std::string_view msg) = 0;
+#else
+ void dbg(Location, std::string_view) {}
+#endif
+
+ // Number of errors reported so far.
+ [[nodiscard]]
+ virtual uint64_t errors() const = 0;
+ // Number of warnings reported so far.
+ [[nodiscard]]
+ virtual uint64_t warnings() const = 0;
+
+ protected:
+ Errors() = default;
+
+ Errors(Errors const&) = delete;
+ Errors& operator=(Errors const&) = delete;
+};
+
+// Interface for a destination that receives formatted diagnostic lines.
+// Instances cannot be copied.
+class ErrorsOutput {
+ public:
+ virtual ~ErrorsOutput() = default;
+
+ // Outputs one line of text; `line` is passed without a trailing newline
+ // by the callers in errors.cc.
+ virtual void println(std::string_view line) = 0;
+
+ protected:
+ ErrorsOutput() = default;
+
+ ErrorsOutput(ErrorsOutput const&) = delete;
+ ErrorsOutput& operator=(ErrorsOutput const&) = delete;
+};
+
+// Returns an Errors object that prints messages as
+// "<filename>:<line>:<column>: <Severity> <message>" to `output`.
+// If `output` is null the messages go to std::cerr.
+[[nodiscard]]
+std::unique_ptr<Errors> file_errors(
+ std::string filename, std::shared_ptr<ErrorsOutput> output = nullptr);
+
+// Returns an Errors object that discards everything and reports zero
+// errors and warnings.
+[[nodiscard]]
+std::unique_ptr<Errors> ignore_errors();
+
+// Returns an ErrorsOutput that writes lines to `out`. `out` is held by
+// reference and must outlive the returned object.
+[[nodiscard]]
+std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out);
+
+} // namespace src
+
+#endif // ERRORS_HH
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc
new file mode 100644
index 0000000..2442c4f
--- /dev/null
+++ b/src/gen_tokens.cc
@@ -0,0 +1,1019 @@
+#include "args.hh"
+#include "errors.hh"
+#include "grammar.hh"
+#include "io.hh"
+#include "prefix_tree.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace {
+
+// Character classes that the generator provides hand-written matchers for.
+// The numeric values follow the order of kCharacterClassNames.
+// NOTE(review): presumably grammar::Symbol::char_class holds these values -
+// confirm against grammar.hh.
+enum class CharacterClass : uint8_t {
+ kWhiteSpace = 0,
+ kLineTerminator = 1,
+ kInputCharacter = 2,
+ kJavaLetter = 3,
+ kJavaLetterOrDigit = 4,
+};
+
+// Grammar names of the character classes, indexed by a symbol's char_class
+// value. The names are also used to build the generated match<Name>() calls.
+std::vector<std::string> const kCharacterClassNames(
+ {"WhiteSpace", "LineTerminator", "InputCharacter", "JavaLetter",
+ "JavaLetterOrDigit"});
+
+// What a generated matcher returns inside its std::optional.
+enum class ReturnType : uint8_t {
+ // std::pair<Token, size_t>
+ kTokenAndSize,
+ // std::pair<Internal, size_t>
+ kInternalAndSize,
+ // size_t (matched length only)
+ kSize,
+};
+
+// Turns `filename` into an upper-case identifier: ASCII letters are
+// upper-cased, ASCII digits and '_' are kept, and every other character is
+// replaced by '_'.
+std::string make_define(std::string_view filename) {
+  // Start from all underscores and overwrite the positions that survive.
+  std::string define(filename.size(), '_');
+  for (size_t i = 0; i < filename.size(); ++i) {
+    char const ch = filename[i];
+    bool const keep_as_is =
+        (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || ch == '_';
+    if (keep_as_is) {
+      define[i] = ch;
+    } else if (ch >= 'a' && ch <= 'z') {
+      // Clearing bit 0x20 maps an ASCII lower-case letter to upper case.
+      define[i] = static_cast<char>(ch & ~0x20);
+    }
+  }
+  return define;
+}
+
+// Generates tokenizer code (a header and a source file) from a grammar.
+class Generator {
+ public:
+ // Entry point; returns false on failure.
+ // NOTE(review): the definition is outside this view - parameter semantics
+ // are inferred from the names and the gen_tokens command line; confirm.
+ bool generate(std::string_view header_name, std::string_view source_name,
+ std::string const& ns, std::string const& unicode_version,
+ grammar::Grammar& grammar);
+
+ private:
+ // Fills specific_tokens_ and above_specific_tokens_ starting at `root`.
+ void find_specific_elements(grammar::Element const& root);
+ // Fills all_elements_ with every element reachable from `root`.
+ void find_all_elements(grammar::Element const& root);
+
+ // Looks for definitions ending in "optional/repeated non-terminal followed
+ // by a required non-terminal" and records them via find_report_last().
+ void check_need_last(grammar::Element const& element);
+ // Fills report_last_ / copy_last_; returns true if `match` was found as a
+ // single-symbol definition somewhere below `root`.
+ bool find_report_last(grammar::Element const& root,
+ grammar::Element const& match);
+
+ // Return type of the matcher generated for an element / character class.
+ [[nodiscard]]
+ ReturnType get_return_type(grammar::Element const& element) const;
+ [[nodiscard]]
+ ReturnType get_return_type(uint8_t character_class) const;
+
+ // Writers for a single symbol, a definition and a whole element. The bool
+ // overloads return false when the construct cannot be generated.
+ void write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+ ReturnType return_type, std::string_view str_arg);
+ bool write_matcher(std::ostream& out, grammar::Definition const& definition,
+ ReturnType return_type, std::string_view indent);
+ bool write_matcher(std::ostream& out, grammar::Element const& element,
+ ReturnType return_type);
+
+ // Elements visited by find_specific_elements() that are not specific
+ // tokens themselves.
+ std::set<std::string_view> above_specific_tokens_;
+ // Names of elements and character classes that become tokens.
+ std::set<std::string_view> specific_tokens_;
+ // Every element reachable from the root given to find_all_elements().
+ std::set<grammar::Element const*> all_elements_;
+ // Elements on the path down to an element in report_last_.
+ std::set<std::string_view> copy_last_;
+ // Elements that have the searched-for element as a single definition.
+ std::set<std::string_view> report_last_;
+};
+
+// Classifies the elements below `root`. An element is "specific" (one of the
+// tokens the tokenizer can return) when any of its definitions has more than
+// one symbol or consists of a terminal; its name is stored in
+// specific_tokens_ and the walk stops there. Any other element is stored in
+// above_specific_tokens_ and its symbols are followed: non-terminals
+// recursively, character classes directly into specific_tokens_.
+void Generator::find_specific_elements(grammar::Element const& root) {
+  auto const is_specific = [](auto const& definition) {
+    if (definition.symbols.size() > 1)
+      return true;
+    return definition.symbols[0].type == grammar::Symbol::Type::kTerminal;
+  };
+
+  if (std::ranges::any_of(root.definitions, is_specific)) {
+    specific_tokens_.insert(root.name);
+    return;
+  }
+
+  above_specific_tokens_.insert(root.name);
+
+  for (auto const& definition : root.definitions) {
+    for (auto const& symbol : definition.symbols) {
+      if (symbol.type == grammar::Symbol::Type::kNonTerminal) {
+        find_specific_elements(*symbol.element);
+      } else if (symbol.type == grammar::Symbol::Type::kCharacterClass) {
+        specific_tokens_.insert(kCharacterClassNames[symbol.char_class]);
+      } else if (symbol.type == grammar::Symbol::Type::kTerminal) {
+        // A terminal would have made the element specific above.
+        std::unreachable();
+      }
+    }
+  }
+}
+
+// Looks at every definition of `element` with at least two symbols. When the
+// last symbol is a required non-terminal and the one before it is a
+// zero-or-one or zero-or-more non-terminal, the optional element is searched
+// (via find_report_last) for the final element, unless the optional element
+// is already in copy_last_.
+void Generator::check_need_last(grammar::Element const& element) {
+  using Optional = grammar::Symbol::Optional;
+  using Type = grammar::Symbol::Type;
+
+  for (auto const& definition : element.definitions) {
+    auto const count = definition.symbols.size();
+    if (count < 2)
+      continue;
+
+    auto const& final_symbol = definition.symbols[count - 1];
+    auto const& before_final = definition.symbols[count - 2];
+
+    if (final_symbol.optional != Optional::kRequired ||
+        final_symbol.type != Type::kNonTerminal)
+      continue;
+    if (before_final.optional != Optional::kZeroOrOne &&
+        before_final.optional != Optional::kZeroOrMore)
+      continue;
+    if (before_final.type != Type::kNonTerminal)
+      continue;
+    if (copy_last_.contains(before_final.element->name))
+      continue;
+
+    find_report_last(*before_final.element, *final_symbol.element);
+  }
+}
+
+// Searches `root` and the elements below it for one that has `match` as a
+// complete definition (a single required non-terminal). That element is
+// added to report_last_; every element on the way down to it is added to
+// copy_last_. Returns true when such an element was found.
+bool Generator::find_report_last(grammar::Element const& root,
+                                 grammar::Element const& match) {
+  using Optional = grammar::Symbol::Optional;
+  using Type = grammar::Symbol::Type;
+
+  auto const is_only_match = [&match](auto const& definition) {
+    if (definition.symbols.size() != 1)
+      return false;
+    auto const& only = definition.symbols[0];
+    return only.optional == Optional::kRequired &&
+           only.type == Type::kNonTerminal && only.element == &match;
+  };
+
+  if (std::ranges::any_of(root.definitions, is_only_match)) {
+    report_last_.insert(root.name);
+    return true;
+  }
+
+  for (auto const& definition : root.definitions) {
+    for (auto const& symbol : definition.symbols) {
+      if (symbol.type != Type::kNonTerminal)
+        continue;
+      // Elements already in copy_last_ are not searched again.
+      if (copy_last_.contains(symbol.element->name))
+        continue;
+      if (!find_report_last(*symbol.element, match))
+        continue;
+      copy_last_.insert(root.name);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Return type of the matcher generated for `element`: token elements return
+// a Token with the size, elements in copy_last_/report_last_ return an
+// Internal with the size, everything else returns only the size.
+ReturnType Generator::get_return_type(grammar::Element const& element) const {
+  auto const& name = element.name;
+
+  bool const is_token =
+      above_specific_tokens_.contains(name) || specific_tokens_.contains(name);
+  if (is_token)
+    return ReturnType::kTokenAndSize;
+
+  bool const is_internal =
+      copy_last_.contains(name) || report_last_.contains(name);
+  return is_internal ? ReturnType::kInternalAndSize : ReturnType::kSize;
+}
+
+// Return type of the matcher for a character class: a Token with the size
+// when the class name was recorded as a token, otherwise only the size.
+ReturnType Generator::get_return_type(uint8_t character_class) const {
+  auto const& class_name = kCharacterClassNames[character_class];
+  bool const is_token = above_specific_tokens_.contains(class_name) ||
+                        specific_tokens_.contains(class_name);
+  return is_token ? ReturnType::kTokenAndSize : ReturnType::kSize;
+}
+
+// Adds `root` and every element reachable from it through non-terminal
+// symbols to all_elements_. An element that is already present is not
+// visited again, which also ends the walk for recursive elements.
+void Generator::find_all_elements(grammar::Element const& root) {
+  bool const newly_added = all_elements_.insert(&root).second;
+  if (!newly_added)
+    return;
+
+  for (auto const& definition : root.definitions) {
+    for (auto const& symbol : definition.symbols) {
+      // Terminals and character classes reference no further elements.
+      if (symbol.type == grammar::Symbol::Type::kNonTerminal)
+        find_all_elements(*symbol.element);
+    }
+  }
+}
+
+// Writes the hand-written matchers for the five character classes
+// (LineTerminator, InputCharacter, WhiteSpace, JavaLetter and
+// JavaLetterOrDigit) to `out` as C++ source.
+//
+// `unicode_version` is the enumerator name appended to "u::Version::" in the
+// generated general-category lookups.
+void write_character_class_matchers(std::ostream& out,
+                                    std::string_view unicode_version) {
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<size_t> matchLineTerminator"
+      << "(std::string_view str) {\n"
+      // Tokenizer reads one line at a time,
+      // so line terminator matches end of line.
+      << " return str.empty() ? std::make_optional<size_t>(0)"
+      << " : std::nullopt;\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<size_t> matchInputCharacter"
+      << "(std::string_view str) {\n"
+      << " if (str.empty())\n"
+      << " return std::nullopt;\n"
+      // UnicodeInputCharacter but not CR or LF
+      << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+      << " auto* ptr = start;\n"
+      << " u8::skip(ptr, start + str.size());\n"
+      << " return ptr - start;\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<std::pair<Token, size_t>> matchWhiteSpace"
+      << "(std::string_view str) {\n"
+      // LineTerminator
+      << " if (auto ret = matchLineTerminator(str); ret.has_value())\n"
+      << " return std::make_pair(Token::kWhiteSpace, ret.value());\n"
+      << " switch (str.front()) {\n"
+      // the ASCII SP character, also known as "space"
+      << " case ' ':\n"
+      // the ASCII HT character, also known as "horizontal tab"
+      << " case '\\t':\n"
+      // the ASCII FF character, also known as "form feed"
+      << " case '\\f':\n"
+      << " return std::make_pair(Token::kWhiteSpace, 1);\n"
+      << " default:\n"
+      << " return std::nullopt;\n"
+      << " }\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<size_t> matchJavaLetter"
+      << "(std::string_view str) {\n"
+      << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+      << " auto* ptr = start;\n"
+      << " auto code = u8::read(ptr, ptr + str.size());\n"
+      << " if (!code.has_value())\n"
+      << " return std::nullopt;\n"
+      // any Unicode character that is a "Java letter"
+      // A "Java letter" is a character for which the method
+      // Character.isJavaIdentifierStart(int) returns true.
+      // A character may start a Java identifier if and only if one of the
+      // following conditions is true:
+      // isLetter(codePoint) returns true
+      // getType(codePoint) returns LETTER_NUMBER
+      // the referenced character is a currency symbol (such as '$')
+      // the referenced character is a connecting punctuation character
+      // (such as '_').
+      << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
+      << ")) {\n"
+      << " case u::GeneralCategory::LETTER_UPPERCASE:\n"
+      << " case u::GeneralCategory::LETTER_LOWERCASE:\n"
+      << " case u::GeneralCategory::LETTER_TITLECASE:\n"
+      << " case u::GeneralCategory::LETTER_MODIFIER:\n"
+      << " case u::GeneralCategory::LETTER_OTHER:\n"
+      << " case u::GeneralCategory::NUMBER_LETTER:\n"
+      << " case u::GeneralCategory::SYMBOL_CURRENCY:\n"
+      << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
+      << " return ptr - start;\n"
+      << " default:\n"
+      << " return std::nullopt;\n"
+      << " }\n"
+      << "}\n"
+      << "\n";
+
+  out << "[[nodiscard]]\n"
+      << "inline std::optional<size_t> matchJavaLetterOrDigit"
+      << "(std::string_view str) {\n"
+      << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n"
+      << " auto* ptr = start;\n"
+      << " auto code = u8::read(ptr, ptr + str.size());\n"
+      << " if (!code.has_value())\n"
+      << " return std::nullopt;\n"
+      // any Unicode character that is a "Java letter-or-digit"
+      // A "Java letter-or-digit" is a character for which the method
+      // Character.isJavaIdentifierPart(int) returns true.
+      // A character may be part of a Java identifier if any of the following
+      // conditions are true:
+      // it is a letter
+      // it is a currency symbol (such as '$')
+      // it is a connecting punctuation character (such as '_')
+      // it is a digit
+      // it is a numeric letter (such as a Roman numeral character)
+      // it is a combining mark
+      // it is a non-spacing mark
+      // isIdentifierIgnorable returns true for the character
+      << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version
+      << ")) {\n"
+      << " case u::GeneralCategory::LETTER_UPPERCASE:\n"
+      << " case u::GeneralCategory::LETTER_LOWERCASE:\n"
+      << " case u::GeneralCategory::LETTER_TITLECASE:\n"
+      << " case u::GeneralCategory::LETTER_MODIFIER:\n"
+      << " case u::GeneralCategory::LETTER_OTHER:\n"
+      << " case u::GeneralCategory::SYMBOL_CURRENCY:\n"
+      << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n"
+      << " case u::GeneralCategory::NUMBER_DIGIT:\n"
+      << " case u::GeneralCategory::NUMBER_LETTER:\n"
+      << " case u::GeneralCategory::MARK_SPACING_COMBINING:\n"
+      << " case u::GeneralCategory::MARK_NONSPACING:\n"
+      << " case u::GeneralCategory::OTHER_FORMAT:\n"
+      << " return ptr - start;\n"
+      << " case u::GeneralCategory::OTHER_CONTROL:\n"
+      << " if ((/* code.value() >= 0 && */ code.value() <= 8) ||\n"
+      << " (code.value() >= 0xe && code.value() <= 0x1b) ||\n"
+      << " (code.value() >= 0x7f && code.value() <= 0x9f))\n"
+      // The consumed length must come from the decoder: U+0080..U+009F are
+      // encoded as two bytes in UTF-8, so a fixed length of 1 would be wrong.
+      << " return ptr - start;\n"
+      << " break;\n"
+      << " default:\n"
+      << " break;\n"
+      << " }\n"
+      << " return std::nullopt;\n"
+      << "}\n"
+      << "\n";
+}
+
+// Writes `in` to `out` as a double-quoted C++ string literal. '"' and '\\'
+// get a backslash in front; characters below ' ' or with the high bit set
+// are written as octal escapes. If an octal digit directly follows an octal
+// escape the literal is closed and reopened (`" "`) so the digit cannot be
+// read as part of the escape. Returns `out`.
+std::ostream& quote(std::ostream& out, std::string_view in) {
+  // True when the previous character was written as an octal escape.
+  bool after_octal = false;
+
+  out << '"';
+  for (auto c : in) {
+    bool const needs_backslash = c == '"' || c == '\\';
+    bool const needs_octal = !needs_backslash && (c < ' ' || (c & 0x80));
+
+    if (needs_octal) {
+      // At most three octal digits for a byte, plus the terminating zero.
+      char digits[4];
+      std::to_chars(digits, digits + sizeof(digits), c & 0xff, 8).ptr[0] = 0;
+      out << "\\" << digits;
+      after_octal = true;
+      continue;
+    }
+
+    if (needs_backslash) {
+      out << '\\';
+    } else if (after_octal && c >= '0' && c <= '7') {
+      out << "\" \"";
+    }
+    after_octal = false;
+    out << c;
+  }
+  out << '"';
+  return out;
+}
+
+// Writes the C++ suffix that converts the std::optional produced by a
+// matcher of type `in_return_type` into the one expected for
+// `out_return_type`. Nothing is written when the two already agree.
+//
+// `in_name` is the element or character class name used to build the
+// Token::k<name> / Internal::k<name> enumerator. When a bare size has to
+// become an Internal pair and `in_name` is empty, Internal::UNDEFINED is
+// used instead.
+void match_return_type(std::ostream& out, ReturnType in_return_type,
+                       std::string_view in_name, ReturnType out_return_type) {
+  switch (out_return_type) {
+    case ReturnType::kTokenAndSize:
+      switch (in_return_type) {
+        case ReturnType::kTokenAndSize:
+          break;
+        case ReturnType::kInternalAndSize:
+          // The emitted call has to end in "})" to close .transform(.
+          out << ".transform([](auto pair) { return std::make_pair(Token::k"
+              << in_name << ", pair.second); })";
+          break;
+        case ReturnType::kSize:
+          out << ".transform([](auto size) { return std::make_pair(Token::k"
+              << in_name << ", size); })";
+          break;
+      }
+      break;
+    case ReturnType::kInternalAndSize:
+      switch (in_return_type) {
+        case ReturnType::kTokenAndSize:
+          // The emitted call has to end in "})" to close .transform(.
+          out << ".transform([](auto pair) { return std::make_pair(Internal::k"
+              << in_name << ", pair.second); })";
+          break;
+        case ReturnType::kInternalAndSize:
+          break;
+        case ReturnType::kSize:
+          if (in_name.empty()) {
+            out << ".transform([](auto size) { return "
+                   "std::make_pair(Internal::UNDEFINED, size); })";
+          } else {
+            out << ".transform([](auto size) { return "
+                   "std::make_pair(Internal::k"
+                << in_name << ", size); })";
+          }
+          break;
+      }
+      break;
+    case ReturnType::kSize:
+      // Only the size is wanted: drop the Token/Internal half of the pair.
+      if (in_return_type != ReturnType::kSize) {
+        out << ".transform([](auto pair) { return pair.second; })";
+      }
+      break;
+  }
+}
+
+// Writes the C++ expression that matches `symbol` against the string
+// expression `str_arg`, followed by whatever conversion is needed so the
+// expression yields `return_type`.
+//  - terminal: a starts_with() test yielding the terminal's length
+//  - non-terminal: a call to match<ElementName>()
+//  - character class: a call to match<ClassName>()
+void Generator::write_matcher(std::ostream& out, grammar::Symbol const& symbol,
+                              ReturnType return_type,
+                              std::string_view str_arg) {
+  // Name and return type of the matcher expression written below.
+  std::string_view produced_name;
+  ReturnType produced_type;
+
+  switch (symbol.type) {
+    case grammar::Symbol::Type::kTerminal: {
+      produced_type = ReturnType::kSize;
+      out << "(" << str_arg << ".starts_with(";
+      quote(out, symbol.value);
+      out << ") ? std::make_optional<size_t>(" << symbol.value.size()
+          << ") : " << "std::nullopt)";
+      break;
+    }
+    case grammar::Symbol::Type::kNonTerminal: {
+      auto const& element = *symbol.element;
+      out << "match" << element.name << "(" << str_arg << ")";
+      produced_type = get_return_type(element);
+      produced_name = element.name;
+      break;
+    }
+    case grammar::Symbol::Type::kCharacterClass: {
+      auto const& class_name = kCharacterClassNames[symbol.char_class];
+      out << "match" << class_name << "(" << str_arg << ")";
+      produced_type = get_return_type(symbol.char_class);
+      produced_name = class_name;
+      break;
+    }
+  }
+
+  match_return_type(out, produced_type, produced_name, return_type);
+}
+
+bool Generator::write_matcher(std::ostream& out,
+ grammar::Definition const& definition,
+ ReturnType return_type, std::string_view indent) {
+ if (definition.symbols.size() == 1 &&
+ definition.symbols[0].optional == grammar::Symbol::Optional::kRequired) {
+ out << indent << "return ";
+ write_matcher(out, definition.symbols[0], return_type, "str");
+ out << ";\n";
+ return true;
+ }
+
+ std::string_view size_suffix;
+ switch (return_type) {
+ case ReturnType::kTokenAndSize:
+ case ReturnType::kInternalAndSize:
+ size_suffix = "->second";
+ break;
+ case ReturnType::kSize:
+ size_suffix = ".value()";
+ break;
+ }
+
+ if (definition.symbols.size() > 1 &&
+ definition.symbols[0].optional == grammar::Symbol::Optional::kRequired &&
+ definition.symbols[1].optional == grammar::Symbol::Optional::kExcluded) {
+ bool first = true;
+ for (auto const& symbol : definition.symbols) {
+ if (first) {
+ out << indent << "auto first_ret = ";
+ write_matcher(out, symbol, return_type, "str");
+ out << ";\n"
+ << indent << "if (!first_ret.has_value())\n"
+ << indent << " return first_ret;\n"
+ << indent << "std::optional<size_t> ret;\n"
+ << indent << "auto tmp = str.substr(0, first_ret" << size_suffix
+ << ");\n";
+ first = false;
+ } else {
+ if (symbol.optional != grammar::Symbol::Optional::kExcluded) {
+ std::cerr << "Non-excluded after at least one excluded\n";
+ return false;
+ }
+ out << indent << "ret = ";
+ write_matcher(out, symbol, ReturnType::kSize, "tmp");
+ out << ";\n"
+ << indent << "if (ret.has_value() && ret.value() == tmp.size())\n"
+ << indent << " return std::nullopt;\n";
+ }
+ }
+ out << indent << "return first_ret;\n";
+ return true;
+ }
+
+ if (std::ranges::all_of(definition.symbols, [](auto const& symbol) {
+ return symbol.optional == grammar::Symbol::Optional::kRequired;
+ })) {
+ out << indent << "size_t tot = 0;\n";
+ bool first = true;
+ for (auto const& symbol : definition.symbols) {
+ std::string indent2(indent);
+ if (first) {
+ out << indent2 << "auto ret = ";
+ write_matcher(out, symbol, return_type, "str");
+ out << ";\n";
+ first = false;
+ } else {
+ out << indent2 << "ret = ";
+ write_matcher(out, symbol, return_type, "str.substr(tot)");
+ out << ";\n";
+ }
+ out << indent2 << "if (!ret.has_value())\n"
+ << indent2 << " return ret;\n";
+ out << indent2 << "tot += ret" << size_suffix << ";\n";
+ }
+ switch (return_type) {
+ case ReturnType::kInternalAndSize:
+ // Return last internal
+ out << indent << "return std::make_pair(ret->first, tot);\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ std::cerr << "Unable to return token and size\n";
+ return false;
+ case ReturnType::kSize:
+ out << indent << "return tot;\n";
+ break;
+ }
+ return true;
+ }
+
+ out << indent << "size_t tot = 0;\n";
+ bool last_internal = false;
+ switch (return_type) {
+ case ReturnType::kInternalAndSize:
+ last_internal = true;
+ out << indent << "std::optional<Internal> last_internal;\n";
+ break;
+ case ReturnType::kTokenAndSize:
+ case ReturnType::kSize:
+ break;
+ }
+ bool at_least_one_required = false;
+ bool first = true;
+ bool first_internal = true;
+ bool next_internal = false;
+ for (size_t i = 0; i < definition.symbols.size(); ++i) {
+ auto const& symbol = definition.symbols[i];
+ std::string indent2(indent);
+ bool have_internal = next_internal;
+ next_internal = false;
+ ReturnType symbol_return_type = return_type;
+
+ if (symbol.optional != grammar::Symbol::Optional::kRequired &&
+ i + 1 < definition.symbols.size() &&
+ definition.symbols[i + 1].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[i + 1].type == grammar::Symbol::Type::kNonTerminal) {
+ symbol_return_type = ReturnType::kInternalAndSize;
+ next_internal = true;
+ }
+
+ switch (symbol_return_type) {
+ case ReturnType::kTokenAndSize:
+ case ReturnType::kInternalAndSize:
+ size_suffix = "->second";
+ break;
+ case ReturnType::kSize:
+ size_suffix = ".value()";
+ break;
+ }
+
+ switch (symbol.optional) {
+ case grammar::Symbol::Optional::kRequired:
+ at_least_one_required = true;
+ break;
+ case grammar::Symbol::Optional::kZeroOrOne:
+ break;
+ case grammar::Symbol::Optional::kZeroOrMore:
+ if (first) {
+ switch (symbol_return_type) {
+ case ReturnType::kTokenAndSize:
+ out << indent << "std::optional<std::pair<Token, size_t>> ret;\n";
+ break;
+ case ReturnType::kInternalAndSize:
+ out << indent
+ << "std::optional<std::pair<Internal, size_t>> ret;\n";
+ break;
+ case ReturnType::kSize:
+ out << indent << "std::optional<size_t> ret;\n";
+ break;
+ }
+ first = false;
+ }
+ out << indent << "while (true) {\n";
+ indent2 += " ";
+ break;
+ case grammar::Symbol::Optional::kExcluded:
+ std::cerr << "Excluded mixed with conditional\n";
+ return false;
+ }
+ if (symbol_return_type == return_type) {
+ if (first) {
+ out << indent2 << "auto ret = ";
+ write_matcher(out, symbol, symbol_return_type, "str");
+ first = false;
+ } else {
+ out << indent2 << "ret = ";
+ write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+ }
+ out << ";\n";
+ } else {
+ if (first_internal) {
+ out << indent2 << "auto ret_internal = ";
+ write_matcher(out, symbol, symbol_return_type,
+ first ? "str" : "str.substr(tot)");
+ first_internal = false;
+ } else {
+ out << indent2 << "ret_internal = ";
+ write_matcher(out, symbol, symbol_return_type, "str.substr(tot)");
+ }
+ out << ";\n";
+ if (first) {
+ out << indent2 << "auto ret = ret_internal";
+ first = false;
+ } else {
+ out << indent2 << "ret = ret_internal";
+ }
+ match_return_type(out, symbol_return_type, "", return_type);
+ out << ";\n";
+ }
+ switch (symbol.optional) {
+ case grammar::Symbol::Optional::kRequired:
+ out << indent2 << "if (!ret.has_value()) {\n";
+ if (have_internal &&
+ symbol.type == grammar::Symbol::Type::kNonTerminal) {
+ out << indent2
+ << " if (!ret_internal.has_value() || ret_internal->first != "
+ "Internal::k"
+ << symbol.element->name << ")\n"
+ << indent2 << " return ret;\n";
+ } else {
+ out << indent2 << " return ret;\n";
+ }
+ out << indent2 << "} else {\n"
+ << indent2 << " tot += ret" << size_suffix << ";\n";
+ if (last_internal)
+ out << indent2 << " last_internal = ret->first;\n";
+ out << indent2 << "}\n";
+ break;
+ case grammar::Symbol::Optional::kZeroOrOne:
+ if (symbol_return_type == ReturnType::kTokenAndSize) {
+ out << indent2 << "tot += ret.has_value() ? ret->second : 0;\n";
+ } else {
+ out << indent2 << "tot += ret.value_or(0);\n";
+ }
+ if (last_internal)
+ out << indent2 << "if (ret.has_value())\n"
+ << indent2 << " last_internal = ret->first;\n";
+ break;
+ case grammar::Symbol::Optional::kZeroOrMore:
+ out << indent2 << "if (!ret.has_value())\n"
+ << indent2 << " break;\n"
+ << indent2 << "tot += ret" << size_suffix << ";\n";
+ if (last_internal)
+ out << indent2 << "last_internal = ret->first;\n";
+ out << indent << "}\n";
+ break;
+ case grammar::Symbol::Optional::kExcluded:
+ assert(false);
+ break;
+ }
+ }
+ switch (return_type) {
+ case ReturnType::kInternalAndSize:
+ // Return last internal
+ if (at_least_one_required) {
+ out << indent << "return std::make_pair(last_internal.value(), tot);\n";
+ } else {
+ out << indent << "if (last_internal.has_value())\n"
+ << indent
+ << " return std::make_pair(last_internal.value(), tot);\n"
+ << indent << "return std::make_pair(Internal::UNDEFINED, tot);\n";
+ }
+ break;
+ case ReturnType::kTokenAndSize:
+ std::cerr << "Unable to return token and size\n";
+ return false;
+ case ReturnType::kSize:
+ out << indent << "return tot;\n";
+ break;
+ }
+ return true;
+}
+
+// Writes a forward declaration of the matcher function for |element| to
+// |out|. The declared return type follows |return_type|; a value outside the
+// enum emits nothing.
+void declare_matcher(std::ostream& out, grammar::Element const& element,
+                     ReturnType return_type) {
+  // All declarations share one shape; only the result type differs.
+  auto const emit = [&out, &element](std::string_view result_type) {
+    out << "[[nodiscard]]\n"
+        << "inline " << result_type << " match" << element.name
+        << "(std::string_view str);\n";
+  };
+
+  switch (return_type) {
+    case ReturnType::kSize:
+      emit("std::optional<size_t>");
+      break;
+    case ReturnType::kTokenAndSize:
+      emit("std::optional<std::pair<Token, size_t>>");
+      break;
+    case ReturnType::kInternalAndSize:
+      emit("std::optional<std::pair<Internal, size_t>>");
+      break;
+  }
+}
+
+// Writes the definition of the matcher function for |element| to |out|.
+//
+// The generated function is called match<ElementName>, takes a
+// std::string_view and returns an optional whose payload follows
+// |return_type|. When a token is requested for an element listed in
+// specific_tokens_, the definitions are matched by size only and the result
+// is wrapped as Token::k<ElementName>.
+//
+// Three code shapes are generated:
+//  * one definition: the definition matcher is emitted directly,
+//  * only single required terminals: a switch (all one character long) or a
+//    prefix tree lookup,
+//  * anything else: every definition is tried and the longest match wins.
+//
+// Returns false, after logging to std::cerr, if generation failed.
+bool Generator::write_matcher(std::ostream& out,
+                              grammar::Element const& element,
+                              ReturnType return_type) {
+  ReturnType sub_return_type = return_type;
+  bool make_token = false;
+
+  switch (return_type) {
+    case ReturnType::kSize:
+      out << "[[nodiscard]]\n"
+          << "inline std::optional<size_t> match" << element.name
+          << "(std::string_view str) {\n";
+      break;
+    case ReturnType::kTokenAndSize:
+      out << "[[nodiscard]]\n"
+          << "inline std::optional<std::pair<Token, size_t>> match"
+          << element.name << "(std::string_view str) {\n";
+
+      if (specific_tokens_.contains(element.name)) {
+        sub_return_type = ReturnType::kSize;
+        make_token = true;
+      }
+      break;
+    case ReturnType::kInternalAndSize:
+      out << "[[nodiscard]]\n"
+          << "inline std::optional<std::pair<Internal, size_t>> match"
+          << element.name << "(std::string_view str) {\n";
+      break;
+  }
+
+  if (element.definitions.size() == 1) {
+    if (make_token) {
+      out << " auto ret = [str]() -> std::optional<size_t> {\n";
+      if (!write_matcher(out, element.definitions[0], sub_return_type,
+                         " ")) {
+        std::cerr << "Error in " << element.name << "\n";
+        return false;
+      }
+      out << " }();\n"
+          << " return ret.transform([](auto size) {\n"
+          << " return std::make_pair(Token::k" << element.name
+          << ", size); });\n";
+    } else {
+      if (!write_matcher(out, element.definitions[0], sub_return_type, " ")) {
+        std::cerr << "Error in " << element.name << "\n";
+        return false;
+      }
+    }
+  } else if (std::ranges::all_of(
+                 element.definitions, [](auto const& definition) {
+                   return definition.symbols.size() == 1 &&
+                          definition.symbols[0].optional ==
+                              grammar::Symbol::Optional::kRequired &&
+                          definition.symbols[0].type ==
+                              grammar::Symbol::Type::kTerminal;
+                 })) {
+    if (std::ranges::all_of(element.definitions, [](auto const& definition) {
+          return definition.symbols[0].value.size() == 1;
+        })) {
+      out << " if (!str.empty()) {\n"
+          << " switch (str.front()) {\n";
+      for (auto const& definition : element.definitions) {
+        char const c = definition.symbols[0].value[0];
+        out << " case '";
+        // A quote or backslash cannot appear verbatim in a character literal.
+        if (c == '\'' || c == '\\')
+          out << '\\';
+        out << c << "':\n";
+      }
+      if (make_token) {
+        // The generated function returns a token pair here, same as the
+        // prefix tree branch below; a bare size would not compile.
+        out << " return std::make_pair(Token::k" << element.name
+            << ", size_t{1});\n";
+      } else {
+        out << " return 1;\n";
+      }
+      out << " default:\n"
+          << " break;\n"
+          << " }\n"
+          << " }\n"
+          << " return std::nullopt;\n";
+    } else {
+      auto builder = prefix_tree::builder();
+      for (auto const& definition : element.definitions) {
+        builder->add(definition.symbols[0].value);
+      }
+      auto tree = builder->build();
+      if (!tree.has_value()) {
+        std::cerr << "To large prefix tree\n";
+        return false;
+      }
+      out << " static const auto tree = ";
+      quote(out, tree.value()) << "sv;\n";
+      out << " return prefix_tree::lookup(tree, str)";
+      if (make_token) {
+        out << ".transform([](auto size) {\n"
+            << " return std::make_pair(Token::k" << element.name
+            << ", size); })";
+      }
+      out << ";\n";
+    }
+  } else {
+    bool first = true;
+    std::string_view ret_type;
+    switch (sub_return_type) {
+      case ReturnType::kTokenAndSize:
+        ret_type = "std::optional<std::pair<Token, size_t>>";
+        break;
+      case ReturnType::kInternalAndSize:
+        ret_type = "std::optional<std::pair<Internal, size_t>>";
+        break;
+      case ReturnType::kSize:
+        ret_type = "std::optional<size_t>";
+        break;
+    }
+    for (auto const& definition : element.definitions) {
+      if (first) {
+        first = false;
+        out << " auto tmp = [str]() -> " << ret_type << " {\n";
+        if (!write_matcher(out, definition, sub_return_type, " ")) {
+          std::cerr << "Error in " << element.name << "\n";
+          return false;
+        }
+        out << " }();\n";
+        out << " auto ret = tmp;\n";
+      } else {
+        out << " tmp = [str]() -> " << ret_type << " {\n";
+        if (!write_matcher(out, definition, sub_return_type, " ")) {
+          std::cerr << "Error in " << element.name << "\n";
+          return false;
+        }
+        out << " }();\n"
+            << " if (tmp.has_value()) {\n";
+        // Longest match wins. Both pair payloads must be compared on their
+        // size member; comparing whole pairs would order by the enum first.
+        if (sub_return_type == ReturnType::kSize) {
+          out << " if (!ret.has_value() || ret.value() < tmp.value()) {\n";
+        } else {
+          out << " if (!ret.has_value() || ret.value().second < "
+                 "tmp.value().second) {\n";
+        }
+        out << " ret = tmp;\n"
+            << " }\n"
+            << " }\n";
+      }
+    }
+    if (make_token) {
+      out << " return ret.transform([](auto size) {\n"
+          << " return std::make_pair(Token::k" << element.name
+          << ", size); });\n";
+    } else {
+      out << " return ret;\n";
+    }
+  }
+
+  out << "}\n"
+      << "\n";
+  return true;
+}
+
+// Generates the tokenizer for |grammar|.
+//
+// |header_name| receives the public interface (the Token enum and
+// matchNext()), |source_name| the implementation with one matcher function
+// per grammar element. Everything is placed in namespace |ns|, and
+// |unicode_version| is forwarded to the character class matchers.
+//
+// Returns false if an output file cannot be opened or written, or if a
+// matcher could not be generated. Output files may be left partially written
+// in that case.
+bool Generator::generate(std::string_view header_name,
+                         std::string_view source_name, std::string const& ns,
+                         std::string const& unicode_version,
+                         grammar::Grammar& grammar) {
+  std::fstream header{std::string(header_name),
+                      std::fstream::trunc | std::fstream::out};
+  if (!header.is_open()) {
+    std::cerr << "Unable to open " << header_name << " for writing\n";
+    return false;
+  }
+  std::fstream source{std::string(source_name),
+                      std::fstream::trunc | std::fstream::out};
+  if (!source.is_open()) {
+    std::cerr << "Unable to open " << source_name << " for writing\n";
+    return false;
+  }
+
+  auto header_guard = make_define(header_name);
+
+  header << "#ifndef " << header_guard << "\n"
+         << "#define " << header_guard << "\n"
+         << "\n"
+         << "#include \"prefix_tree.hh\"\n"
+         << "\n"
+         << "#include <cstddef>\n"
+         << "#include <cstdint>\n"
+         << "#include <optional>\n"
+         << "#include <string_view>\n"
+         << "#include <utility>\n"
+         << "\n"
+         << "namespace " << ns << " {\n"
+         << "\n";
+
+  find_specific_elements(grammar.root());
+
+  find_all_elements(grammar.root());
+
+  for (auto& element : all_elements_) {
+    check_need_last(*element);
+  }
+
+  header << "enum class Token : "
+         << (specific_tokens_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n";
+  for (auto const& token : specific_tokens_) {
+    header << " k" << token << ",\n";
+  }
+  header << "};\n";
+
+  header << "\n"
+         << "[[nodiscard]]\n"
+         << "std::optional<std::pair<Token, size_t>>"
+         << " matchNext(std::string_view str);\n"
+         << "\n"
+         << "} // " << ns << "\n"
+         << "\n"
+         << "#endif // " << header_guard << "\n";
+
+  source << "#include \"" << header_name << "\"\n"
+         << "\n"
+         << "#include \"prefix_tree.hh\"\n"
+         << "#include \"u.hh\"\n"
+         << "#include \"u8.hh\"\n"
+         << "\n"
+         << "#include <cstddef>\n"
+         << "#include <optional>\n"
+         << "#include <string_view>\n"
+         << "#include <utility>\n"
+         << "\n"
+         << "using namespace std::literals::string_view_literals;\n"
+         << "\n"
+         << "// NOLINTBEGIN(readability-else-after-return)\n"
+         << "\n"
+         << "namespace " << ns << " {\n";
+
+  source << "namespace {\n"
+         << "\n";
+
+  write_character_class_matchers(source, unicode_version);
+
+  // UNDEFINED takes one value, so all_elements_.size() + 1 enumerators must
+  // fit in the underlying type.
+  source << "\n"
+         << "enum class Internal : "
+         << (all_elements_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n"
+         << " UNDEFINED,\n";
+  for (auto* element : all_elements_) {
+    source << " k" << element->name << ",\n";
+  }
+  source << "};\n"
+         << "\n";
+
+  // Declare every matcher up front so definitions may reference each other
+  // regardless of order.
+  for (auto* element : all_elements_) {
+    declare_matcher(source, *element, get_return_type(*element));
+  }
+
+  // Stops at the first element that fails to generate.
+  if (std::ranges::any_of(all_elements_, [this, &source](auto* element) {
+        auto sts = get_return_type(*element);
+        return !write_matcher(source, *element, sts);
+      })) {
+    return false;
+  }
+
+  source << "\n"
+         << "} // namespace\n";
+
+  source << "\n"
+         << "std::optional<std::pair<Token, size_t>>"
+         << "matchNext(std::string_view str) {\n"
+         << " return match" << grammar.root().name << "(str);\n"
+         << "}"
+         << "\n"
+         << "} // namespace " << ns << "\n"
+         << "\n"
+         << "// NOLINTEND(readability-else-after-return)\n"
+         << "\n";
+
+  // Make sure buffered data reached the files before claiming success.
+  header.flush();
+  source.flush();
+  if (!header || !source) {
+    std::cerr << "Error writing " << (!header ? header_name : source_name)
+              << '\n';
+    return false;
+  }
+
+  return true;
+}
+
+} // namespace
+
+// Entry point of gen_tokens: parses the command line, loads the grammar file
+// and writes the generated header and source. Exits with 1 on any failure.
+int main(int argc, char** argv) {
+  constexpr std::string_view kUsageHint =
+      "Try `gen_tokens --help` for usage\n";
+
+  auto parser = Args::create();
+  auto help_flag = parser->option('h', "help", "display this text and exit");
+  auto namespace_opt = parser->option_argument('\0', "namespace", "ARG",
+                                               "Namespace for tokenizer");
+  auto unicode_opt =
+      parser->option_argument('u', "unicode", "ARG", "Unicode version");
+
+  std::vector<std::string_view> positional;
+  if (!parser->run(argc, argv, &positional)) {
+    parser->print_error(std::cerr);
+    std::cerr << kUsageHint;
+    return 1;
+  }
+
+  if (help_flag->is_set()) {
+    std::cout << "Usage: `gen_tokens [OPTIONS...] tokens.grammar"
+              << " OUTPUT.hh OUTPUT.cc`\n"
+              << "Generates a tokenizer for grammar.\n"
+              << "\n";
+    parser->print_help(std::cout);
+    return 0;
+  }
+
+  // Both options are mandatory.
+  if (!namespace_opt->is_set()) {
+    std::cerr << "No namespace given.\n" << kUsageHint;
+    return 1;
+  }
+  if (!unicode_opt->is_set()) {
+    std::cerr << "No unicode version given.\n" << kUsageHint;
+    return 1;
+  }
+  auto ns = namespace_opt->argument();
+  auto unicode = unicode_opt->argument();
+
+  // Positional arguments: grammar file, output header, output source.
+  if (positional.size() != 3) {
+    std::cerr << "Expecting three arguments. No more, no less.\n"
+              << kUsageHint;
+    return 1;
+  }
+
+  auto grammar_path = std::string(positional[0]);
+  auto input = io::open(grammar_path);
+  if (!input.has_value()) {
+    std::cerr << "Unable to open " << grammar_path << '\n';
+    return 1;
+  }
+
+  auto errors = src::file_errors(std::move(grammar_path));
+  auto grammar =
+      grammar::load(std::move(input.value()), kCharacterClassNames, *errors);
+  if (!grammar || errors->errors() > 0)
+    return 1;
+
+  Generator generator;
+  bool const ok =
+      generator.generate(positional[1], positional[2], ns, unicode, *grammar);
+  return ok ? 0 : 1;
+}
diff --git a/src/gen_ugc.cc b/src/gen_ugc.cc
index 2670803..9100946 100644
--- a/src/gen_ugc.cc
+++ b/src/gen_ugc.cc
@@ -28,7 +28,7 @@ std::map<std::string, u::GeneralCategory, std::less<>> str2gc{
{"Lo", u::GeneralCategory::LETTER_OTHER},
{"Mn", u::GeneralCategory::MARK_NONSPACING},
- {"Mc", u::GeneralCategory::MARK_SPACING_COMBINDING},
+ {"Mc", u::GeneralCategory::MARK_SPACING_COMBINING},
{"Me", u::GeneralCategory::MARK_SPACING_ENCLOSING},
{"Nd", u::GeneralCategory::NUMBER_DIGIT},
@@ -277,6 +277,7 @@ int main(int argc, char** argv) {
args->option_argument('p', "prefix", "ARG", "Prefix for exported method");
std::vector<std::string_view> arguments;
if (!args->run(argc, argv, &arguments)) {
+ args->print_error(std::cerr);
std::cerr << "Try `gen_u --help` for usage\n";
return 1;
}
diff --git a/src/grammar.cc b/src/grammar.cc
new file mode 100644
index 0000000..25c4d64
--- /dev/null
+++ b/src/grammar.cc
@@ -0,0 +1,268 @@
+#include "grammar.hh"
+
+#include "errors.hh"
+#include "line.hh"
+#include "location.hh"
+#include "str.hh"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <format>
+#include <functional>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace grammar {
+
+namespace {
+
+// Grammar implementation that owns all loaded elements. The first element in
+// the list is handed out as the root.
+class GrammarImpl : public Grammar {
+ public:
+  // Takes ownership of |elements|; root() requires the list to be non-empty.
+  explicit GrammarImpl(std::vector<std::unique_ptr<Element>> elements)
+      : elements_(std::move(elements)) {}
+
+  [[nodiscard]]
+  Element const& root() const override {
+    return *elements_.front();
+  }
+
+ private:
+  std::vector<std::unique_ptr<Element>> elements_;
+};
+
+// An element as collected by the first pass over the grammar file: where its
+// header line was found and the raw, still unparsed, definition lines.
+struct FirstPassElement {
+  // Location of the "Name:" line that introduced the element.
+  src::Location loc;
+  // One entry per indented line below the header, leading spaces removed.
+  std::vector<std::string> definitions;
+
+  explicit FirstPassElement(src::Location loc) : loc{loc} {}
+};
+
+// Loads a grammar file in two passes.
+//
+// The first pass reads every line and groups the indented definition lines
+// under the "Name:" header they follow. The second pass splits each
+// definition into symbols and resolves them to non-terminals (other
+// elements), character classes or plain terminals. Finally the element that
+// nobody references is moved first so it becomes the root.
+class GrammarLoader {
+ public:
+  GrammarLoader(std::unique_ptr<io::Reader> reader,
+                std::vector<std::string> const& character_classes,
+                src::Errors& errors)
+      : reader_(line::open(std::move(reader))),
+        character_classes_(character_classes),
+        errors_(errors) {}
+
+  // Returns the loaded grammar, or nullptr if the file could not be read,
+  // contained no elements, or too many character classes were given. Other
+  // problems are reported to the errors object passed at construction and
+  // still return a grammar, so callers must check the error count as well.
+  std::unique_ptr<Grammar> load() {
+    // Symbol::char_class is a uint8_t index; a longer list would silently
+    // wrap around.
+    if (character_classes_.size() > UINT8_MAX) {
+      errors_.err(src::Location(reader_->number(), 0),
+                  "Too many character classes");
+      return nullptr;
+    }
+
+    // Read whole file in a first pass, before parsing definitions.
+    std::map<std::string, FirstPassElement> first_pass_elements;
+    FirstPassElement* last_element = nullptr;
+
+    std::map<std::string_view, uint8_t, std::less<>> cc_lookup;
+    for (size_t i = 0; i < character_classes_.size(); ++i) {
+      cc_lookup.emplace(character_classes_[i], static_cast<uint8_t>(i));
+    }
+
+    while (true) {
+      auto line = reader_->read();
+      auto loc = src::Location(reader_->number(), 0);
+      if (!line.has_value()) {
+        if (line.error() == io::ReadError::Eof)
+          break;
+        errors_.err(loc, "Error reading");
+        return nullptr;
+      }
+      // Skip blank lines and comments.
+      if (line.value().empty() || line.value().front() == '#')
+        continue;
+      if (line.value().front() == ' ') {
+        // Continue on last element
+        size_t i = 1;
+        while (i < line.value().size() && line.value()[i] == ' ')
+          ++i;
+        if (i == line.value().size()) {
+          errors_.err(loc, "Unexpected line, only spaces");
+          continue;
+        }
+        if (last_element == nullptr) {
+          errors_.err(loc, "Expected element before indented lines");
+          continue;
+        }
+        last_element->definitions.emplace_back(line.value().substr(i));
+      } else {
+        // New element
+        if (line.value().back() != ':') {
+          errors_.err(
+              loc,
+              "Unexpected line, not indented but also not ending with a ':'");
+          continue;
+        }
+
+        auto name = line.value().substr(0, line.value().size() - 1);
+        if (cc_lookup.contains(name)) {
+          errors_.warn(
+              loc, std::format("Element {} overrides character class", name));
+        }
+        auto pair = first_pass_elements.emplace(name, FirstPassElement(loc));
+        if (!pair.second) {
+          errors_.err(loc, std::format("Duplicate element {}", name));
+        }
+        // On a duplicate this points at the already existing element.
+        last_element = &pair.first->second;
+      }
+    }
+
+    if (first_pass_elements.empty()) {
+      errors_.err(src::Location(reader_->number(), 0), "No elements found");
+      return nullptr;
+    }
+
+    // Create all elements up front so symbols can point at elements that are
+    // defined later in the file.
+    std::vector<std::unique_ptr<Element>> second_pass_elements;
+    std::map<std::string_view, size_t, std::less<>> second_pass_lookup;
+    for (auto const& pair : first_pass_elements) {
+      auto element = std::make_unique<Element>();
+      element->name = pair.first;
+      second_pass_lookup.emplace(element->name, second_pass_elements.size());
+      second_pass_elements.emplace_back(std::move(element));
+    }
+
+    // Both containers are in the same (name sorted) order.
+    auto it = second_pass_elements.begin();
+    for (auto const& pair : first_pass_elements) {
+      auto const& element = *it++;
+      if (pair.second.definitions.empty()) {
+        errors_.err(pair.second.loc,
+                    std::format("No definitions for {}", pair.first));
+        continue;
+      }
+      std::vector<std::string_view> in_symbols;
+      for (auto const& in_definition : pair.second.definitions) {
+        str::split(in_definition, in_symbols);
+
+        std::vector<Symbol> out_symbols;
+        // Set once "but not" has been seen; every following symbol is then
+        // an exclusion and "or" is only a separator.
+        bool exclude = false;
+        // Set right after "but", the next symbol must be "not".
+        bool expect_not = false;
+        for (auto in_symbol : in_symbols) {
+          Symbol out_symbol;
+          if (exclude) {
+            if (in_symbol == "or")
+              continue;
+            out_symbol.optional = Symbol::Optional::kExcluded;
+          } else if (expect_not) {
+            expect_not = false;
+            if (in_symbol == "not") {
+              exclude = true;
+            } else {
+              errors_.err(pair.second.loc, "but is not followed by not");
+            }
+            continue;
+          }
+          if (in_symbol == "but") {
+            expect_not = true;
+            continue;
+          }
+          // {X} is zero-or-more and [X] is zero-or-one. The size check keeps
+          // a bare "{}" or "[]" from turning into an empty symbol name.
+          if (in_symbol.size() > 2 && in_symbol.front() == '{' &&
+              in_symbol.back() == '}') {
+            if (exclude) {
+              errors_.err(pair.second.loc,
+                          "Optional and exclude doesn't work together");
+            } else {
+              out_symbol.optional = Symbol::Optional::kZeroOrMore;
+            }
+            in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+          } else if (in_symbol.size() > 2 && in_symbol.front() == '[' &&
+                     in_symbol.back() == ']') {
+            if (exclude) {
+              errors_.err(pair.second.loc,
+                          "Optional and exclude doesn't work together");
+            } else {
+              out_symbol.optional = Symbol::Optional::kZeroOrOne;
+            }
+            in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+          }
+          // Elements take precedence over character classes, anything
+          // unknown is a terminal.
+          auto it2 = second_pass_lookup.find(in_symbol);
+          if (it2 != second_pass_lookup.end()) {
+            out_symbol.type = Symbol::Type::kNonTerminal;
+            out_symbol.element = second_pass_elements[it2->second].get();
+          } else {
+            auto it3 = cc_lookup.find(in_symbol);
+            if (it3 != cc_lookup.end()) {
+              out_symbol.type = Symbol::Type::kCharacterClass;
+              out_symbol.char_class = it3->second;
+            } else {
+              out_symbol.type = Symbol::Type::kTerminal;
+              out_symbol.value = in_symbol;
+            }
+          }
+          out_symbols.emplace_back(std::move(out_symbol));
+        }
+
+        if (expect_not) {
+          errors_.err(pair.second.loc, "but is not followed by not");
+        }
+
+        if (out_symbols.empty()) {
+          errors_.err(pair.second.loc, "no symbols found in definition");
+          continue;
+        }
+
+        element->definitions.emplace_back(
+            Definition{.symbols = std::move(out_symbols)});
+      }
+    }
+
+    // Find root and move it first (if needed)
+    std::vector<size_t> used(second_pass_elements.size(), 0);
+    for (auto const& element : second_pass_elements) {
+      for (auto const& definition : element->definitions) {
+        for (auto const& symbol : definition.symbols) {
+          switch (symbol.type) {
+            case Symbol::Type::kTerminal:
+            case Symbol::Type::kCharacterClass:
+              break;
+            case Symbol::Type::kNonTerminal:
+              used[second_pass_lookup.find(symbol.element->name)->second]++;
+              break;
+          }
+        }
+      }
+    }
+
+    // The first unreferenced element is the root, any further ones only get
+    // a warning.
+    std::optional<size_t> root_index;
+    for (size_t i = 0; i < used.size(); ++i) {
+      if (used[i] == 0) {
+        if (root_index.has_value()) {
+          errors_.warn(first_pass_elements.find(second_pass_elements[i]->name)
+                           ->second.loc,
+                       "Is not referenced but also not root");
+        } else {
+          root_index = i;
+        }
+      }
+    }
+
+    if (root_index.has_value()) {
+      if (root_index.value() != 0) {
+        // Symbols point at the Element objects, not the vector slots, so
+        // swapping the owning pointers keeps them valid.
+        std::swap(second_pass_elements[0],
+                  second_pass_elements[root_index.value()]);
+      }
+    } else {
+      errors_.err(
+          first_pass_elements.find(second_pass_elements[0]->name)->second.loc,
+          "No root element found");
+    }
+
+    return std::make_unique<GrammarImpl>(std::move(second_pass_elements));
+  }
+
+ private:
+  std::unique_ptr<line::Reader> reader_;
+  std::vector<std::string> const& character_classes_;
+  src::Errors& errors_;
+};
+
+} // namespace
+
+// Loads a grammar from |reader|; see GrammarLoader::load() for details on
+// the return value and error reporting.
+std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader,
+                              std::vector<std::string> const& character_classes,
+                              src::Errors& errors) {
+  return GrammarLoader(std::move(reader), character_classes, errors).load();
+}
+
+} // namespace grammar
diff --git a/src/grammar.hh b/src/grammar.hh
new file mode 100644
index 0000000..13beec4
--- /dev/null
+++ b/src/grammar.hh
@@ -0,0 +1,80 @@
+#ifndef GRAMMAR_HH
+#define GRAMMAR_HH
+
+#include "errors.hh"
+#include "io.hh"
+
+#include <cstdint>
+#include <span>
+#include <string>
+#include <vector>
+
+namespace grammar {
+
+struct Element;
+
+// One symbol in a definition. Which of value, element and char_class is
+// meaningful depends on type.
+struct Symbol {
+  enum class Type : uint8_t {
+    // value == terminal, as UTF-8
+    kTerminal,
+
+    // element != nullptr
+    kNonTerminal,
+
+    // char_class is the zero-based index into the character_classes list
+    // given to load(), so 0 is a valid class.
+    kCharacterClass,
+  };
+
+  // Defaulted so a default constructed Symbol never holds an indeterminate
+  // value.
+  Type type{Type::kTerminal};
+
+  enum class Optional : uint8_t {
+    // Symbol is NOT optional.
+    kRequired,
+    // Symbol is optional.
+    kZeroOrOne,
+    // Symbol is optional and can repeat.
+    kZeroOrMore,
+    // Symbol should be excluded from previous symbol.
+    // Example: InputCharacter but not * or /
+    kExcluded,
+  };
+
+  Optional optional{Optional::kRequired};
+
+  // Only meaningful when type == kCharacterClass.
+  uint8_t char_class{0};
+
+  // Only set when type == kNonTerminal; not owned.
+  Element const* element{nullptr};
+  // Only set when type == kTerminal.
+  std::string value;
+};
+
+// One alternative of an element: the symbols of a single definition line, in
+// the order they were written.
+struct Definition {
+ std::vector<Symbol> symbols;
+};
+
+// A named grammar element together with its alternative definitions.
+struct Element {
+ // Name as written in the grammar file, without the trailing ':'.
+ std::string name;
+
+ // One entry per definition line found below the element name.
+ std::vector<Definition> definitions;
+};
+
+// A loaded grammar. Instances are created with load(); the interface is not
+// copyable.
+class Grammar {
+ public:
+ virtual ~Grammar() = default;
+
+ // The root element of the grammar. Other elements are reached through
+ // Symbol::element of the root's definitions.
+ [[nodiscard]]
+ virtual Element const& root() const = 0;
+
+ protected:
+ Grammar() = default;
+
+ Grammar(Grammar const&) = delete;
+ Grammar& operator=(Grammar const&) = delete;
+};
+
+// Loads a grammar from |reader|.
+//
+// Symbols matching a name in |character_classes| (and no element name) become
+// Symbol::Type::kCharacterClass with char_class set to the index of the name
+// in that list.
+//
+// Problems are reported to |errors|. Returns nullptr if reading fails or no
+// elements are found; other errors are reported but may still return a
+// grammar, so check |errors| as well as the returned pointer.
+std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader,
+ std::vector<std::string> const& character_classes,
+ src::Errors& errors);
+
+} // namespace grammar
+
+#endif // GRAMMAR_HH
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
new file mode 100644
index 0000000..59748c1
--- /dev/null
+++ b/src/java_tokens.cc
@@ -0,0 +1,374 @@
+#include "java_tokens.hh"
+
+#include "errors.hh"
+#include "java_tokens_java-8.hh"
+#include "java_uescape.hh"
+#include "str.hh"
+#include "u8.hh"
+#include "uline.hh"
+
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <expected>
+#include <format>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <utility>
+
+namespace java {
+
+namespace {
+
+template <typename MatchNext, typename MatchToken>
+class TokensImpl : public Tokens {
+ public:
+ TokensImpl(std::unique_ptr<io::Reader> reader,
+ std::unique_ptr<src::Errors> errors, TokensConfig config)
+ : reader_(u8::line::open(u8::java::open(std::move(reader)))),
+ errors_(std::move(errors)),
+ config_(config) {}
+
+  // Returns the next token, fetching new lines from the reader as needed.
+  // Whitespace is skipped. Read errors, including end of file, are returned
+  // as the unexpected value.
+  //
+  // Input that matches no token is reported to the error handler and
+  // returned as a kError token holding the rest of the line; tokenizing then
+  // resumes on the next line.
+  std::expected<Token, io::ReadError> read() override {
+    while (true) {
+      while (line_.empty()) {
+        auto maybe_line = reader_->read();
+        if (!maybe_line.has_value())
+          return std::unexpected(maybe_line.error());
+        line_ = maybe_line.value();
+        location_.line = reader_->number();
+        location_.column = 0;
+      }
+
+      Token token;
+      token.loc = location_;
+
+      auto maybe_token_pair = match_next_(line_);
+      if (!maybe_token_pair.has_value()) {
+        errors_->err(location_, std::format("Invalid token: {}", line_));
+        token.type = Token::Type::kError;
+        token.str = line_;
+        // Consume the offending input. Without this every following call
+        // would fail on the very same text and never make progress.
+        line_ = {};
+        return token;
+      }
+
+      token.str = line_.substr(0, maybe_token_pair->second);
+      // Columns count code points, not bytes.
+      location_.column += u8len(token.str);
+      line_ = line_.substr(maybe_token_pair->second);
+      switch (maybe_token_pair->first) {
+        case MatchToken::kBinaryIntegerLiteral:
+          handle_int_literal(token, /* base */ 2);
+          break;
+        case MatchToken::kBooleanLiteral:
+          token.type = Token::Type::kLiteralBoolean;
+          token.int_value = token.str != "false";
+          break;
+        case MatchToken::kCharacterLiteral:
+          token.type = Token::Type::kLiteralCharacter;
+          // str[0] is the opening quote.
+          if (token.str[1] == '\\') {
+            token.int_value =
+                unescape(token.str.substr(1, token.str.size() - 2)).first;
+          } else {
+            auto* ptr =
+                reinterpret_cast<uint8_t const*>(token.str.data() + 1);
+            auto* end = ptr + token.str.size() - 2;
+            token.int_value = u8::read(ptr, end).value();
+          }
+          break;
+        case MatchToken::kDecimalFloatingPointLiteral:
+          handle_float_literal(token);
+          break;
+        case MatchToken::kDecimalIntegerLiteral:
+          handle_int_literal(token);
+          break;
+        case MatchToken::kEndOfLineComment:
+          token.type = Token::Type::kComment;
+          token.str = str::trim(token.str.substr(2));
+          break;
+        case MatchToken::kHexIntegerLiteral:
+          handle_int_literal(token, /* base */ 16);
+          break;
+        case MatchToken::kHexadecimalFloatingPointLiteral:
+          handle_float_literal(token, /* base */ 16);
+          break;
+        case MatchToken::kIdentifier:
+          token.type = Token::Type::kIdentifier;
+          break;
+        case MatchToken::kKeyword:
+          token.type = Token::Type::kKeyword;
+          break;
+        case MatchToken::kNullLiteral:
+          token.type = Token::Type::kLiteralNull;
+          break;
+        case MatchToken::kOctalIntegerLiteral:
+          handle_int_literal(token, /* base */ 8);
+          break;
+        case MatchToken::kOperator:
+          token.type = Token::Type::kOperator;
+          break;
+        case MatchToken::kSeparator:
+          token.type = Token::Type::kSeparator;
+          break;
+        case MatchToken::kStringLiteral:
+          token.type = Token::Type::kLiteralString;
+          token.str =
+              unescape_if_needed(token.str.substr(1, token.str.size() - 2));
+          break;
+        case MatchToken::kTraditionalComment: {
+          token.type = Token::Type::kComment;
+          // A matched comment is at least "/**/".
+          assert(token.str.size() >= 4);
+          // The content ends where the closing "*/" starts. Leading stars
+          // are only counted up to there, otherwise the star of the closing
+          // "*/" in "/**/" would be counted as well and the length below
+          // would underflow.
+          size_t const content_end = token.str.size() - 2;
+          size_t s = 2;
+          while (s < content_end && token.str[s] == '*')
+            ++s;
+          token.str = str::trim(token.str.substr(s, content_end - s));
+          token.int_value = static_cast<int64_t>(s - 1);
+          // TODO: handle multiline
+          break;
+        }
+        case MatchToken::kWhiteSpace:
+          // Not a token, match again on what is left.
+          continue;
+      }
+      return token;
+    }
+  }
+
+ private:
+  // Called when parsing |str| (the literal without radix prefix and
+  // underscores, but including any type suffix) as int64_t failed with
+  // |err|. Either recovers a long literal that only fits in 64 unsigned bits
+  // or reports the literal as invalid and marks |token| as an error.
+  void handle_int_literal_error(Token& token, std::string_view str,
+                                std::errc err, int base) {
+    if (err == std::errc::result_out_of_range) {
+      // Java assumes two's complement (so 0xffff_ffff is -1) and also,
+      // negative literals are read as positive (because the operator '-' is
+      // a separate token)
+      uint64_t tmp = 0;
+      auto ret =
+          std::from_chars(str.data(), str.data() + str.size(), tmp, base);
+      // Anything left after the digits is the long suffix. A literal without
+      // it is an int and can never need more than 63 bits.
+      bool const is_long = ret.ptr < str.data() + str.size();
+      // Decimal literals may at most be 2^63 (the operand of unary minus),
+      // other bases may use all 64 bits.
+      bool const in_range = base != 10 || tmp <= (uint64_t{1} << 63);
+      if (ret.ec == std::errc() && is_long && in_range) {
+        token.type = Token::Type::kLiteralLong;
+        token.int_value = static_cast<int64_t>(tmp);
+        return;
+      }
+    }
+    errors_->err(location_,
+                 std::format("Invalid integer literal: {}", token.str));
+    token.type = Token::Type::kError;
+  }
+
+  // Parses the integer literal in token.str, written in |base|, and stores
+  // the result in token.int_value. token.type becomes kLiteralLong when the
+  // literal carries an l/L suffix, otherwise kLiteralInt, or kError if the
+  // value does not fit.
+  void handle_int_literal(Token& token, int base = 10) {
+    // Length of the radix prefix: "0x" / "0b", "0" for octal, none otherwise.
+    size_t const prefix = (base == 16 || base == 2) ? 2 : (base == 8 ? 1 : 0);
+
+    // The text handed to the parser: prefix removed and, if there are any,
+    // underscores filtered out into a temporary.
+    std::string without_underscores;
+    std::string_view digits;
+    if (token.str.find('_') == std::string_view::npos) {
+      digits = token.str.substr(prefix);
+    } else {
+      without_underscores.reserve(token.str.size() - prefix);
+      for (size_t i = prefix; i < token.str.size(); ++i) {
+        if (token.str[i] != '_') {
+          without_underscores.push_back(token.str[i]);
+        }
+      }
+      digits = without_underscores;
+    }
+
+    char const* const first = digits.data();
+    char const* const last = first + digits.size();
+    auto const parsed = std::from_chars(first, last, token.int_value, base);
+    if (parsed.ec != std::errc()) {
+      handle_int_literal_error(token, digits, parsed.ec, base);
+      return;
+    }
+
+    // Whatever follows the digits is the type suffix.
+    bool const long_suffix =
+        parsed.ptr < last && (*parsed.ptr == 'l' || *parsed.ptr == 'L');
+    if (long_suffix) {
+      token.type = Token::Type::kLiteralLong;
+      return;
+    }
+
+    // Decimal int literals may be at most 2^31, other bases may use all 32
+    // bits.
+    bool const decimal_too_big =
+        base == 10 &&
+        token.int_value >
+            static_cast<int64_t>(1) + std::numeric_limits<int32_t>::max();
+    if (decimal_too_big ||
+        std::cmp_greater(token.int_value,
+                         std::numeric_limits<uint32_t>::max())) {
+      errors_->err(location_,
+                   std::format("Invalid integer literal: {}", token.str));
+      token.type = Token::Type::kError;
+      return;
+    }
+
+    token.type = Token::Type::kLiteralInt;
+    token.int_value = static_cast<int32_t>(token.int_value);
+  }
+
+  // Parses the floating point literal in token.str, decimal or (with base
+  // 16) hexadecimal, and stores the result in token.float_value.
+  // token.type becomes kLiteralFloatingPoint for an f/F suffix, otherwise
+  // kLiteralDoubleFloatingPoint, or kError if parsing fails.
+  void handle_float_literal(Token& token, int base = 10) {
+    // std::from_chars wants exactly one chars_format enumerator; a
+    // combination such as general | hex is not a valid argument. The hex
+    // format expects the text without the "0x" prefix.
+    bool const is_hex = base == 16;
+    std::chars_format const fmt =
+        is_hex ? std::chars_format::hex : std::chars_format::general;
+    size_t const prefix = is_hex ? 2 : 0;
+
+    char const* const first = token.str.data() + prefix;
+    char const* const last = token.str.data() + token.str.size();
+
+    std::from_chars_result ret;
+    if (token.str.ends_with("f") || token.str.ends_with("F")) {
+      // float and double do not parse exactly the same, so use a float
+      // parser for float.
+      // Initialized as from_chars leaves it untouched on failure.
+      float tmp = 0;
+      ret = std::from_chars(first, last, tmp, fmt);
+      token.type = Token::Type::kLiteralFloatingPoint;
+      token.float_value = tmp;
+    } else {
+      ret = std::from_chars(first, last, token.float_value, fmt);
+      token.type = Token::Type::kLiteralDoubleFloatingPoint;
+    }
+
+    if (ret.ec != std::errc()) {
+      // Java allows 0 with just a suffix, std::from_chars does not.
+      if (token.str == "0f" || token.str == "0F") {
+        token.type = Token::Type::kLiteralFloatingPoint;
+        token.float_value = 0;
+        return;
+      }
+      if (token.str == "0d" || token.str == "0D") {
+        token.type = Token::Type::kLiteralDoubleFloatingPoint;
+        token.float_value = 0;
+        return;
+      }
+      errors_->err(location_,
+                   std::format("Invalid float literal: {}", token.str));
+      token.type = Token::Type::kError;
+    }
+  }
+
+  // Returns |str| with every backslash escape replaced by the UTF-8 encoding
+  // of the character it stands for. Without any backslash |str| is returned
+  // as is; otherwise the result is a view of unescape_tmp_, which is
+  // overwritten by the next call that has to unescape.
+  std::string_view unescape_if_needed(std::string_view str) {
+    auto escape_at = str.find('\\');
+    if (escape_at == std::string_view::npos)
+      return str;
+
+    unescape_tmp_.clear();
+    unescape_tmp_.reserve(str.size());
+
+    uint8_t encoded[4];
+    size_t copied_to = 0;
+    do {
+      // Plain text in front of the escape.
+      unescape_tmp_.append(str, copied_to, escape_at - copied_to);
+
+      auto const [value, consumed] = unescape(str.substr(escape_at));
+      auto* out = encoded;
+      u8::write(out, encoded + sizeof(encoded), value);
+      unescape_tmp_.append(
+          std::string_view(reinterpret_cast<char*>(encoded), out - encoded));
+
+      copied_to = escape_at + consumed;
+      escape_at = str.find('\\', copied_to);
+    } while (escape_at != std::string::npos);
+
+    // Plain text after the last escape.
+    unescape_tmp_.append(str, copied_to);
+    return unescape_tmp_;
+  }
+
+  // Decodes the escape sequence at the start of |in|, which must begin with
+  // a backslash and may be followed by more text. Returns the character
+  // value and the number of bytes of |in| the escape occupies.
+  //
+  // Strings coming here have already been validated by the tokenizer
+  static std::pair<uint16_t, size_t> unescape(std::string_view in) {
+    assert(in.front() == '\\');
+    assert(in.size() > 1);
+    switch (in[1]) {
+      case 'b':
+        return std::make_pair(8, 2);
+      case 't':
+        return std::make_pair(9, 2);
+      case 'n':
+        return std::make_pair(10, 2);
+      case 'f':
+        return std::make_pair(12, 2);
+      case 'r':
+        return std::make_pair(13, 2);
+      case '"':
+        return std::make_pair(34, 2);
+      case '\'':
+        return std::make_pair(39, 2);
+      case '\\':
+        return std::make_pair(92, 2);
+      case '0':
+      case '1':
+      case '2':
+      case '3':
+      case '4':
+      case '5':
+      case '6':
+      case '7': {
+        // An octal escape is one or two octal digits, or three when the
+        // first is 0-3 (so the value is at most \377). Digits beyond that
+        // are ordinary characters and must not be consumed.
+        size_t const max_digits = in[1] <= '3' ? 3 : 2;
+        size_t end = 2;
+        while (end < in.size() && end < 1 + max_digits && in[end] >= '0' &&
+               in[end] <= '7') {
+          ++end;
+        }
+        uint8_t tmp = 0;
+        std::from_chars(in.data() + 1, in.data() + end, tmp, /* base */ 8);
+        return std::make_pair(tmp, end);
+      }
+      default:
+        std::unreachable();
+    }
+  }
+
+  // Returns how many times u8::skip() succeeds when walking |str| from start
+  // to end; used as the length of |str| in characters.
+  static size_t u8len(std::string_view str) {
+    auto* cursor = reinterpret_cast<uint8_t const*>(str.data());
+    auto* const limit = cursor + str.size();
+    size_t length = 0;
+    for (; u8::skip(cursor, limit); ++length) {
+    }
+    return length;
+  }
+
+ std::unique_ptr<u8::line::Reader> reader_;
+ std::unique_ptr<src::Errors> errors_;
+ TokensConfig const config_;
+ MatchNext match_next_;
+ std::string_view line_;
+ Location location_;
+ std::string unescape_tmp_;
+};
+
+// Adapter that lets TokensImpl call the generated Java 8 matcher.
+struct MatchNextJava8 {
+  auto operator()(std::string_view str) const
+      -> std::optional<std::pair<java_8::Token, size_t>> {
+    return java_8::matchNext(str);
+  }
+};
+
+} // namespace
+
+// Creates a tokenizer reading from |reader| for the Java version selected in
+// |config|. Errors found while tokenizing are reported to |errors|.
+std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
+                             std::unique_ptr<src::Errors> errors,
+                             TokensConfig config) {
+  switch (config.version) {
+    case Version::kJava8: {
+      using Java8Tokens = TokensImpl<MatchNextJava8, java_8::Token>;
+      return std::make_unique<Java8Tokens>(std::move(reader),
+                                           std::move(errors), config);
+    }
+  }
+  std::unreachable();
+}
+
+} // namespace java
diff --git a/src/java_tokens.hh b/src/java_tokens.hh
new file mode 100644
index 0000000..6fbefcb
--- /dev/null
+++ b/src/java_tokens.hh
@@ -0,0 +1,92 @@
+#ifndef JAVA_TOKENS_HH
+#define JAVA_TOKENS_HH
+
+#include "errors.hh"
+#include "io.hh"
+#include "java_version.hh" // IWYU pragma: export
+#include "location.hh"
+
+#include <expected>
+#include <memory>
+#include <string_view>
+
+namespace java {
+
+using src::Location;
+
+// One lexical token as returned by Tokens::read(). Which of str, int_value
+// and float_value are meaningful depends on type; see each enumerator.
+struct Token {
+ enum class Type : uint8_t {
+ // str is the content of the comment, excluding the leading and trailing
+ // slash and star. int_value is the number of stars at the head: 0 for a
+ // single line comment, 1 for /* and 2 for /** and so on.
+ kComment,
+
+ // str is identifier
+ kIdentifier,
+
+ // str is keyword, int_value is Keyword index
+ kKeyword,
+
+ // str is separator, int_value is Separator index
+ kSeparator,
+
+ // str is operator, int_value is Operator index
+ kOperator,
+
+ // int_value is literal value. The tests expect it as a signed 32-bit
+ // value, e.g. 0xffff_ffff gives -1.
+ kLiteralInt,
+
+ // int_value is literal value
+ kLiteralLong,
+
+ // str is literal value, without the surrounding quotes and with escape
+ // sequences decoded (see the string_escapes test).
+ kLiteralString,
+
+ // int_value is literal value as unicode code-point
+ kLiteralCharacter,
+
+ kLiteralNull,
+
+ // float_value is literal value
+ kLiteralFloatingPoint,
+
+ // float_value is literal value
+ kLiteralDoubleFloatingPoint,
+
+ // int_value is literal value, 0 = false, anything else = true
+ kLiteralBoolean,
+
+ // NOTE(review): presumably returned for input that could not be
+ // tokenized -- confirm against the implementation.
+ kError,
+ };
+
+ Type type;
+ // Position of the token; the tests expect 1-based lines and 0-based columns.
+ Location loc;
+ // Non-owning view. NOTE(review): lifetime is presumably tied to the Tokens
+ // instance and may end at the next read() -- confirm.
+ std::string_view str;
+ int64_t int_value{0};
+ double float_value{0};
+};
+
+// Options accepted by open().
+struct TokensConfig {
+ // Source version of Java file; defaults to Version::kMax.
+ Version version = Version::kMax;
+};
+
+// Abstract stream of Java tokens. Instances are created with open().
+class Tokens {
+ public:
+ virtual ~Tokens() = default;
+
+ // Returns the next token, or an io::ReadError. The tests expect
+ // io::ReadError::Eof once the input is exhausted.
+ virtual std::expected<Token, io::ReadError> read() = 0;
+
+ protected:
+ Tokens() = default;
+ // Not copyable.
+ Tokens(Tokens const&) = delete;
+ Tokens& operator=(Tokens const&) = delete;
+};
+
+// Creates a tokenizer reading Java source from `reader`. Takes ownership of
+// the reader and of the src::Errors instance. config.version selects the
+// Java version whose token grammar is used.
+[[nodiscard]] std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
+ std::unique_ptr<src::Errors>,
+ TokensConfig config = {});
+
+} // namespace java
+
+#endif // JAVA_TOKENS_HH
diff --git a/src/java_version.hh b/src/java_version.hh
new file mode 100644
index 0000000..444ae36
--- /dev/null
+++ b/src/java_version.hh
@@ -0,0 +1,16 @@
+#ifndef JAVA_VERSION_HH
+#define JAVA_VERSION_HH
+
+#include <cstdint>
+
+namespace java {
+
+// Java source language versions known to the tokenizer.
+enum class Version : uint8_t {
+ kJava8 = 8,
+
+ // Alias used as the default version; currently the same as kJava8.
+ kMax = kJava8,
+};
+
+} // namespace java
+
+#endif // JAVA_VERSION_HH
diff --git a/src/location.cc b/src/location.cc
new file mode 100644
index 0000000..3fa1075
--- /dev/null
+++ b/src/location.cc
@@ -0,0 +1,12 @@
+#include "location.hh"
+
+#include <ostream>
+
+namespace src {
+
+// Writes `loc` as "<line>:<column>" and returns the stream.
+std::ostream& operator<<(std::ostream& out, Location const& loc) {
+  return out << loc.line << ':' << loc.column;
+}
+
+} // namespace src
diff --git a/src/location.hh b/src/location.hh
new file mode 100644
index 0000000..1a210cb
--- /dev/null
+++ b/src/location.hh
@@ -0,0 +1,31 @@
+#ifndef LOCATION_HH
+#define LOCATION_HH
+
+#include <compare>
+#include <cstdint>
+#include <iosfwd>
+
+namespace src {
+
+// A position in a source file, identified by line and column.
+struct Location {
+  uint64_t line;
+  uint16_t column;
+
+  constexpr Location() : line(0), column(0) {}
+
+  constexpr Location(uint64_t line, uint16_t column)
+      : line(line), column(column) {}
+
+  // Orders by line first and column second. The defaulted operator compares
+  // the members in declaration order, which is exactly that, and it also
+  // makes operator== / operator!= available, which a hand-written
+  // operator<=> does not.
+  [[nodiscard]]
+  constexpr std::strong_ordering operator<=>(Location const& loc) const =
+      default;
+};
+
+std::ostream& operator<<(std::ostream& out, Location const& loc);
+
+} // namespace src
+
+#endif // LOCATION_HH
diff --git a/src/prefix_tree.cc b/src/prefix_tree.cc
index f16df22..56466e8 100644
--- a/src/prefix_tree.cc
+++ b/src/prefix_tree.cc
@@ -42,6 +42,17 @@ class BuilderImpl : public Builder {
[[nodiscard]]
std::optional<std::string> build() const override {
+  // Prefer the encoding with 1 byte sizes and only fall back to 2 byte
+  // sizes when the first attempt fails.
+  if (auto compact = build(1); compact.has_value())
+    return compact;
+  return build(2);
+ }
+
+ private:
+ [[nodiscard]]
+ std::optional<std::string> build(uint8_t size) const {
+ assert(size > 0);
+
std::string tree;
std::string strings;
@@ -49,20 +60,19 @@ class BuilderImpl : public Builder {
for (auto const& str : strings_)
tmp.emplace(str);
- if (!write_tree(tmp, tree, strings))
+ if (!write_tree(tmp, tree, strings, size))
return std::nullopt;
std::string header;
- if (tree.size() > 0xffff)
+ write_u8(header, size);
+ if (!write_size(header, size, tree.size()))
return std::nullopt;
- write_u16(header, tree.size());
return header + tree + strings;
}
- private:
bool write_tree(std::set<std::string_view> const& input, std::string& tree,
- std::string& strings) const {
+ std::string& strings, uint8_t size) const {
std::map<char, std::set<std::string_view>> buckets;
bool match = false;
for (auto& str : input) {
@@ -76,8 +86,8 @@ class BuilderImpl : public Builder {
write_u8(tree, buckets.size() + (match ? 1 : 0));
if (match) {
write_u8(tree, 0);
- write_u16(tree, 0);
- write_u16(tree, 0);
+ write_size(tree, size, 0);
+ write_size(tree, size, 0);
}
std::string extra;
for (auto& pair : buckets) {
@@ -95,23 +105,23 @@ class BuilderImpl : public Builder {
}
write_u8(tree, 1 + str.size());
- if (strings.size() > 0xffff)
+ if (!write_size(tree, size, strings.size()))
return false;
- write_u16(tree, strings.size());
strings.push_back(pair.first);
strings.append(str);
if (extra.size() > 0xffff)
return false;
- write_u16(tree, extra.size());
+ if (!write_size(tree, size, extra.size()))
+ return false;
if (str.empty()) {
- if (!write_tree(pair.second, extra, strings))
+ if (!write_tree(pair.second, extra, strings, size))
return false;
} else {
std::set<std::string_view> tmp;
for (auto& str2 : pair.second) {
tmp.emplace(str2.substr(str.size()));
}
- if (!write_tree(tmp, extra, strings))
+ if (!write_tree(tmp, extra, strings, size))
return false;
}
}
@@ -120,12 +130,27 @@ class BuilderImpl : public Builder {
return true;
}
+  // Appends `value` to `str` as a `size` byte field via write_u8/write_u16.
+  // Returns false, writing nothing, when `value` does not fit in the field.
+  // `size` must be 1 or 2; anything else asserts.
+  static bool write_size(std::string& str, uint8_t size, size_t value) {
+    switch (size) {
+      case 1:
+        if (value > 0xff)
+          return false;
+        write_u8(str, value);
+        return true;
+      case 2:
+        if (value > 0xffff)
+          return false;
+        write_u16(str, value);
+        return true;
+      default:
+        assert(false);
+        return false;
+    }
+  }
+
std::set<std::string> strings_;
};
-} // namespace
-
-std::optional<size_t> lookup(std::string_view tree, std::string_view str) {
+std::optional<size_t> lookup16(std::string_view tree, std::string_view str) {
size_t base_str = 2 + get_u16(tree, 0);
size_t node = 2;
std::optional<size_t> match;
@@ -168,6 +193,63 @@ std::optional<size_t> lookup(std::string_view tree, std::string_view str) {
return match;
}
+// Lookup in a tree whose size fields are one byte wide. `tree` is the data
+// following the size marker byte (lookup() passes tree.substr(1)).
+//
+// Layout as read here: byte 0 holds the length of the node area, so strings
+// start at base_str = 1 + that length. A node is a child count followed by
+// one 3 byte entry per child: string length, string offset relative to
+// base_str, and a jump relative to the end of the child list.
+//
+// Returns the accumulated length of the matched child strings, or nullopt
+// when nothing matched.
+std::optional<size_t> lookup8(std::string_view tree, std::string_view str) {
+ size_t base_str = 1 + get_u8(tree, 0);
+ size_t node = 1;
+ // Length matched so far along the current path.
+ std::optional<size_t> match;
+ // Value of `match` at the last node that carried a zero-length marker entry.
+ std::optional<size_t> earlier_match;
+
+ while (node < base_str && !str.empty()) {
+ auto children = get_u8(tree, node);
+
+ if (children == 0) {
+ // Leaf
+ return match;
+ }
+
+ size_t child_node = node + 1;
+ size_t child_end = child_node + (static_cast<size_t>(children) * 3);
+ for (; child_node < child_end; child_node += 3) {
+ uint8_t len = get_u8(tree, child_node);
+ uint8_t offset = get_u8(tree, child_node + 1);
+
+ if (str.starts_with(tree.substr(base_str + offset, len))) {
+ // Match but not a leaf, always first in the list of children
+ if (len == 0) {
+ earlier_match = match;
+ continue;
+ }
+ // Consume the matched part and descend into the child's node.
+ match = match.value_or(0) + len;
+ str = str.substr(len);
+ auto jump = get_u8(tree, child_node + 2);
+ node = child_end + jump;
+ break;
+ }
+ }
+
+ // The loop ran to completion without descending: no child matched the
+ // rest of str, so fall back to the last recorded marker.
+ if (child_node == child_end)
+ return earlier_match;
+ }
+
+ if (node == base_str)
+ return earlier_match;
+ return match;
+}
+
+} // namespace
+
std::optional<size_t> lookup(std::string_view tree, std::string_view str) {
+  // The first byte tells how many bytes each size field uses; the matching
+  // lookup gets the data after that byte.
+  switch (get_u8(tree, 0)) {
+    case 1:
+      return lookup8(tree.substr(1), str);
+    case 2:
+      return lookup16(tree.substr(1), str);
+    default:
+      break;
+  }
+  assert(false);
+  return std::nullopt;
+}
+
std::unique_ptr<Builder> builder() { return std::make_unique<BuilderImpl>(); }
} // namespace prefix_tree
diff --git a/src/str.cc b/src/str.cc
index bd7a654..44db3a6 100644
--- a/src/str.cc
+++ b/src/str.cc
@@ -6,6 +6,15 @@
namespace str {
+namespace {
+
+// True for space, tab, carriage return and line feed only.
+[[nodiscard]]
+inline bool is_space(char c) {
+  switch (c) {
+    case ' ':
+    case '\t':
+    case '\r':
+    case '\n':
+      return true;
+    default:
+      return false;
+  }
+}
+
+} // namespace
+
void split(std::string_view str, std::vector<std::string_view>& out,
char separator, bool keep_empty) {
out.clear();
@@ -31,4 +40,14 @@ std::vector<std::string_view> split(std::string_view str, char separator,
return vec;
}
+// Returns the part of `str` left after dropping leading and trailing
+// characters for which is_space() is true. The result views the same data.
+std::string_view trim(std::string_view str) {
+  while (!str.empty() && is_space(str.front()))
+    str.remove_prefix(1);
+  while (!str.empty() && is_space(str.back()))
+    str.remove_suffix(1);
+  return str;
+}
+
} // namespace str
diff --git a/src/str.hh b/src/str.hh
index 58d5d32..e1ee549 100644
--- a/src/str.hh
+++ b/src/str.hh
@@ -13,6 +13,9 @@ void split(std::string_view str, std::vector<std::string_view>& out,
char separator = ' ',
bool keep_empty = false);
+// Returns a view of str without leading and trailing space, tab, carriage
+// return and line feed characters.
+[[nodiscard]]
+std::string_view trim(std::string_view str);
+
} // namespace str
#endif // STR_HH
diff --git a/src/ugc.hh b/src/ugc.hh
index c49d50f..206877f 100644
--- a/src/ugc.hh
+++ b/src/ugc.hh
@@ -13,7 +13,7 @@ enum class GeneralCategory : uint8_t {
LETTER_OTHER,
MARK_NONSPACING,
- MARK_SPACING_COMBINDING,
+ MARK_SPACING_COMBINING,
MARK_SPACING_ENCLOSING,
NUMBER_DIGIT,
diff --git a/test/java_tokens.cc b/test/java_tokens.cc
new file mode 100644
index 0000000..df37409
--- /dev/null
+++ b/test/java_tokens.cc
@@ -0,0 +1,551 @@
+#include "java_tokens.hh"
+
+#include "errors.hh"
+#include "io.hh"
+
+#include <gtest/gtest.h>
+#include <memory>
+#include <string_view>
+#include <utility>
+
+using namespace std::literals::string_view_literals;
+
+namespace {
+
+// Fixture for tokenizer tests, parameterized on the Java version.
+class JavaTokens : public testing::TestWithParam<java::Version> {
+ protected:
+  // Error sink from src::file_errors(), given the running test's name.
+  static std::unique_ptr<src::Errors> make_errors() {
+    auto const* info = testing::UnitTest::GetInstance()->current_test_info();
+    return src::file_errors(info->name());
+  }
+};
+
+} // namespace
+
+// Tokenizes a minimal class declaration spanning two lines and checks type,
+// text and location of every token. Lines are 1-based, columns 0-based.
+TEST_P(JavaTokens, empty_class) {
+ auto input = io::memory(R"(class Empty {
+})");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kKeyword, ret->type);
+ EXPECT_EQ("class", ret->str);
+ EXPECT_EQ(1, ret->loc.line);
+ EXPECT_EQ(0, ret->loc.column);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+ EXPECT_EQ("Empty", ret->str);
+ EXPECT_EQ(1, ret->loc.line);
+ EXPECT_EQ(6, ret->loc.column);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kSeparator, ret->type);
+ EXPECT_EQ("{", ret->str);
+ EXPECT_EQ(1, ret->loc.line);
+ EXPECT_EQ(12, ret->loc.column);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kSeparator, ret->type);
+ EXPECT_EQ("}", ret->str);
+ EXPECT_EQ(2, ret->loc.line);
+ EXPECT_EQ(0, ret->loc.column);
+ // After the last token every read reports end of input.
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Traditional comments do not nest: comment openers inside the comment are
+// plain content. The delimiters and surrounding blanks are not part of str,
+// and int_value is 1 for a comment opened with a single star.
+TEST_P(JavaTokens, traditional_comment) {
+ auto input = io::memory(R"(/* this comment /* // /** ends here: */)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kComment, ret->type);
+ EXPECT_EQ("this comment /* // /** ends here:", ret->str);
+ EXPECT_EQ(1, ret->int_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// An end-of-line comment gives a kComment token with int_value 0 and str
+// holding the text after the slashes, without the leading blank.
+TEST_P(JavaTokens, single_line_comment) {
+ auto input = io::memory(R"(// this is a comment)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kComment, ret->type);
+ EXPECT_EQ("this is a comment", ret->str);
+ EXPECT_EQ(0, ret->int_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Identifiers with digits, underscores and non-ASCII letters. The expected
+// columns count characters rather than bytes: MAX_VALUE is at column 16
+// right after the five letter Greek identifier starting at column 10.
+TEST_P(JavaTokens, identifiers) {
+ auto input = io::memory(R"(String i3 αρετη MAX_VALUE isLetterOrDigit)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+ EXPECT_EQ("String", ret->str);
+ EXPECT_EQ(1, ret->loc.line);
+ EXPECT_EQ(0, ret->loc.column);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+ EXPECT_EQ("i3", ret->str);
+ EXPECT_EQ(1, ret->loc.line);
+ EXPECT_EQ(7, ret->loc.column);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+ EXPECT_EQ("αρετη", ret->str);
+ EXPECT_EQ(1, ret->loc.line);
+ EXPECT_EQ(10, ret->loc.column);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+ EXPECT_EQ("MAX_VALUE", ret->str);
+ EXPECT_EQ(1, ret->loc.line);
+ EXPECT_EQ(16, ret->loc.column);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kIdentifier, ret->type);
+ EXPECT_EQ("isLetterOrDigit", ret->str);
+ EXPECT_EQ(1, ret->loc.line);
+ EXPECT_EQ(26, ret->loc.column);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Decimal, octal and hexadecimal int literals, with underscores between
+// digits. Values are expected as signed 32-bit integers.
+TEST_P(JavaTokens, int_literals) {
+ auto input = io::memory(R"(
+0 2 0372 0xDada_Cafe 1996 0x00_FF__00_FF
+)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(0L, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(2L, ret->int_value);
+ // 0372 is octal.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(250L, ret->int_value);
+ // 0xDADACAFE has the top bit set, so it is negative as a 32-bit value.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(-623195394L, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(1996L, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(16711935L, ret->int_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Literals with an l or L suffix are reported as kLiteralLong, including
+// values that do not fit in 32 bits.
+TEST_P(JavaTokens, long_literals) {
+ auto input = io::memory(R"(
+0l 0777L 0x100000000L 2_147_483_648L 0xC0B0L
+)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(0, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(511, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(4294967296LL, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(2147483648LL, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(49328, ret->int_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Boundary values of int literals in decimal, hexadecimal, octal and binary
+// notation: the largest positive value, the bit pattern of the smallest
+// negative value and all bits set (-1).
+TEST_P(JavaTokens, int_literal_min_max) {
+ auto input = io::memory(R"(
+2147483647
+-2147483648
+0x7fff_ffff
+0177_7777_7777
+0b0111_1111_1111_1111_1111_1111_1111_1111
+0x8000_0000
+0200_0000_0000
+0b1000_0000_0000_0000_0000_0000_0000_0000
+0xffff_ffff
+0377_7777_7777
+0b1111_1111_1111_1111_1111_1111_1111_1111
+)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(2147483647L, ret->int_value);
+ // The minus sign is a separate operator token; the literal 2147483648
+ // that follows it is expected with the value of the smallest int.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kOperator, ret->type);
+ EXPECT_EQ("-", ret->str);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(-2147483647L - 1, ret->int_value);
+ // Largest positive value in hex, octal and binary.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(2147483647L, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(2147483647L, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(2147483647L, ret->int_value);
+ // Only the top bit set, in hex, octal and binary.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(-2147483647L - 1, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(-2147483647L - 1, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(-2147483647L - 1, ret->int_value);
+ // All 32 bits set, in hex, octal and binary.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(-1L, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(-1L, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type);
+ EXPECT_EQ(-1L, ret->int_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Boundary values of long literals in decimal, hexadecimal, octal and
+// binary notation: the largest positive value, the bit pattern of the
+// smallest negative value and all bits set (-1).
+TEST_P(JavaTokens, long_literal_min_max) {
+ auto input = io::memory(R"(
+9223372036854775807L
+-9223372036854775808L
+0x7fff_ffff_ffff_ffffL
+07_7777_7777_7777_7777_7777L
+0b0111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111L
+0x8000_0000_0000_0000L
+010_0000_0000_0000_0000_0000L
+0b1000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000L
+0xffff_ffff_ffff_ffffL
+017_7777_7777_7777_7777_7777L
+0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111L
+)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(9223372036854775807LL, ret->int_value);
+ // The minus sign is a separate operator token; the literal that follows
+ // it is expected with the value of the smallest long.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kOperator, ret->type);
+ EXPECT_EQ("-", ret->str);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value);
+ // Largest positive value in hex, octal and binary.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(9223372036854775807LL, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(9223372036854775807LL, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(9223372036854775807LL, ret->int_value);
+ // Only the top bit set, in hex, octal and binary.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value);
+ // All 64 bits set, in hex, octal and binary.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(-1LL, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(-1LL, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type);
+ EXPECT_EQ(-1LL, ret->int_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Literals with an f suffix are reported as kLiteralFloatingPoint. The
+// expected values are single precision constants, compared against the
+// double float_value.
+TEST_P(JavaTokens, float_literals) {
+ auto input = io::memory(R"(
+1e1f 2.f .3f 0f 3.14f 6.022137e+23f
+)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+ EXPECT_EQ(1e1F, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+ EXPECT_EQ(2.F, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+ EXPECT_EQ(.3F, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+ EXPECT_EQ(0.F, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+ EXPECT_EQ(3.14F, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type);
+ EXPECT_EQ(6.022137e+23F, ret->float_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Floating point literals without a suffix, or with a d suffix, are
+// reported as kLiteralDoubleFloatingPoint.
+TEST_P(JavaTokens, double_literals) {
+ auto input = io::memory(R"(
+1e1 2. .3 0.0 3.14 1e-9d 1e137
+)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+ EXPECT_EQ(1e1, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+ EXPECT_EQ(2., ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+ EXPECT_EQ(.3, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+ EXPECT_EQ(0.0, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+ EXPECT_EQ(3.14, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+ EXPECT_EQ(1e-9, ret->float_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type);
+ EXPECT_EQ(1e137, ret->float_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// true and false are kLiteralBoolean tokens; int_value is non-zero for
+// true and 0 for false.
+TEST_P(JavaTokens, bool_literals) {
+ auto input = io::memory(R"(
+true false
+)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralBoolean, ret->type);
+ EXPECT_TRUE(ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralBoolean, ret->type);
+ EXPECT_EQ(0, ret->int_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Character literals: plain characters, backslash escapes, unicode
+// escapes, an octal escape and a non-ASCII character. int_value holds the
+// code point.
+TEST_P(JavaTokens, char_literals) {
+ auto input = io::memory(R"(
+'a'
+
+'%'
+
+'\t'
+
+'\\'
+
+'\''
+
+'\u03a9'
+
+'\uFFFF'
+
+'\177'
+
+'™'
+)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+ EXPECT_EQ('a', ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+ EXPECT_EQ('%', ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+ EXPECT_EQ('\t', ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+ EXPECT_EQ('\\', ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+ EXPECT_EQ('\'', ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+ EXPECT_EQ(0x3a9, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+ EXPECT_EQ(0xffff, ret->int_value);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+ EXPECT_EQ(0177, ret->int_value);
+ // The trade mark sign is U+2122.
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type);
+ EXPECT_EQ(0x2122, ret->int_value);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// String literals: str is the content without the surrounding quotes. Two
+// literals joined with + over two lines give three separate tokens.
+TEST_P(JavaTokens, string_literals) {
+ auto input = io::memory(R"(
+""
+"\""
+"This is a string"
+"This is a " +
+ "two-line string"
+)");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+ EXPECT_EQ("", ret->str);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+ EXPECT_EQ(R"(")", ret->str);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+ EXPECT_EQ("This is a string", ret->str);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+ EXPECT_EQ("This is a ", ret->str);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kOperator, ret->type);
+ EXPECT_EQ("+", ret->str);
+ ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+ EXPECT_EQ("two-line string", ret->str);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Every escape sequence inside one string literal, including octal escapes
+// of one and three digits. The expected value is a string_view literal
+// because it contains an embedded NUL.
+TEST_P(JavaTokens, string_escapes) {
+ auto input = io::memory(R"("\b\t\n\f\r\"\'\\\0\1\177")");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralString, ret->type);
+ EXPECT_EQ("\b\t\n\f\r\"'\\\0\1\177"sv, ret->str);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// null is reported as kLiteralNull, not as an identifier or keyword.
+TEST_P(JavaTokens, null) {
+ auto input = io::memory("null");
+ auto tokens = java::open(std::move(input), make_errors(),
+ java::TokensConfig{.version = GetParam()});
+ auto ret = tokens->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(java::Token::Type::kLiteralNull, ret->type);
+ ret = tokens->read();
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::ReadError::Eof, ret.error());
+}
+
+// Runs every JavaTokens test once for each Java version listed here.
+INSTANTIATE_TEST_SUITE_P(AllVersions, JavaTokens,
+ testing::Values(java::Version::kJava8));
diff --git a/test/java_uescape.cc b/test/java_uescape.cc
index a6657d8..a9f6746 100644
--- a/test/java_uescape.cc
+++ b/test/java_uescape.cc
@@ -132,6 +132,15 @@ TEST(java_uescape_u8, bad_surrogate) {
EXPECT_EQ(io::ReadError::InvalidData, ret.error());
}
+// A unicode escape between single quotes, read through the line reader
+// stacked on the Java unicode-escape reader, comes out as the decoded
+// character, and a second read has no value.
+// NOTE(review): the name suggests a regression test; the issue it covers
+// is not recorded here.
+TEST(java_uescape_u8, issue) {
+ auto reader = u8::line::open(u8::java::open(io::memory(R"('\u03a9')")));
+ auto ret = reader->read();
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ("'Ω'", ret.value());
+ ret = reader->read();
+ EXPECT_FALSE(ret.has_value());
+}
+
TEST(java_uescape_u16, escaped_escape) {
auto reader = u16::java::open(io::memory(R"(\\u2122=\u2122)"));
std::u16string tmp;
diff --git a/test/prefix_tree.cc b/test/prefix_tree.cc
index 6c00adb..86c8990 100644
--- a/test/prefix_tree.cc
+++ b/test/prefix_tree.cc
@@ -1,5 +1,7 @@
#include "prefix_tree.hh"
+#include "str.hh"
+
#include <gtest/gtest.h>
TEST(prefix_tree, empty) {
@@ -45,3 +47,27 @@ TEST(prefix_tree, sanity) {
ASSERT_TRUE(ret.has_value());
EXPECT_EQ(3, ret.value());
}
+
+// Builds a tree from every space separated word of a longer text and looks
+// up a few of them; a hit returns the length of the matched word.
+// NOTE(review): presumably large enough to need the encoding with 2 byte
+// sizes -- confirm.
+TEST(prefix_tree, many_and_long) {
+ auto builder = prefix_tree::builder();
+ for (auto str : str::split(
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do "
+ "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut "
+ "enim ad minim veniam, quis nostrud exercitation ullamco laboris "
+ "nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in "
+ "reprehenderit in voluptate velit esse cillum dolore eu fugiat "
+ "nulla pariatur. Excepteur sint occaecat cupidatat non proident, "
+ "sunt in culpa qui officia deserunt mollit anim id est laborum.")) {
+ builder->add(str);
+ }
+ auto tree = builder->build();
+ ASSERT_TRUE(tree.has_value());
+ // The empty string matches nothing.
+ auto ret = prefix_tree::lookup(tree.value(), "");
+ EXPECT_FALSE(ret.has_value());
+ ret = prefix_tree::lookup(tree.value(), "Lorem");
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(5, ret.value());
+ ret = prefix_tree::lookup(tree.value(), "cillum");
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(6, ret.value());
+}
diff --git a/test/u.cc b/test/u.cc
index d43109b..d94f74e 100644
--- a/test/u.cc
+++ b/test/u.cc
@@ -711,7 +711,7 @@ TEST_P(UnicodeVersionTest, lookup_gc) {
EXPECT_EQ(u::lookup_gc(0x483, GetParam()),
u::GeneralCategory::MARK_NONSPACING);
EXPECT_EQ(u::lookup_gc(0x93b, GetParam()),
- u::GeneralCategory::MARK_SPACING_COMBINDING);
+ u::GeneralCategory::MARK_SPACING_COMBINING);
EXPECT_EQ(u::lookup_gc(0x20de, GetParam()),
u::GeneralCategory::MARK_SPACING_ENCLOSING);