diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2025-09-27 20:11:32 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2025-09-28 22:48:24 +0200 |
| commit | c1ae5d53fb0fa7ceb9d6fc7a60c87df958ce37fe (patch) | |
| tree | f028a04619aa1b69f8b0aa72a5154f6ba1c09775 | |
| parent | 2f13baa843bd1fb5db6630a2823681ffaff9fb11 (diff) | |
WIPWIP
| -rw-r--r-- | data/java-8/tokens.grammar | 385 | ||||
| -rw-r--r-- | meson.build | 119 | ||||
| -rw-r--r-- | src/errors.cc | 116 | ||||
| -rw-r--r-- | src/errors.hh | 64 | ||||
| -rw-r--r-- | src/gen_tokens.cc | 1019 | ||||
| -rw-r--r-- | src/gen_ugc.cc | 3 | ||||
| -rw-r--r-- | src/grammar.cc | 268 | ||||
| -rw-r--r-- | src/grammar.hh | 80 | ||||
| -rw-r--r-- | src/java_tokens.cc | 374 | ||||
| -rw-r--r-- | src/java_tokens.hh | 92 | ||||
| -rw-r--r-- | src/java_version.hh | 16 | ||||
| -rw-r--r-- | src/location.cc | 12 | ||||
| -rw-r--r-- | src/location.hh | 31 | ||||
| -rw-r--r-- | src/prefix_tree.cc | 112 | ||||
| -rw-r--r-- | src/str.cc | 19 | ||||
| -rw-r--r-- | src/str.hh | 3 | ||||
| -rw-r--r-- | src/ugc.hh | 2 | ||||
| -rw-r--r-- | test/java_tokens.cc | 551 | ||||
| -rw-r--r-- | test/java_uescape.cc | 9 | ||||
| -rw-r--r-- | test/prefix_tree.cc | 26 | ||||
| -rw-r--r-- | test/u.cc | 2 |
21 files changed, 3285 insertions, 18 deletions
diff --git a/data/java-8/tokens.grammar b/data/java-8/tokens.grammar new file mode 100644 index 0000000..3521ac0 --- /dev/null +++ b/data/java-8/tokens.grammar @@ -0,0 +1,385 @@ +InputElement: + WhiteSpace + Comment + Token + +Token: + Identifier + Keyword + Literal + Separator + Operator + +Comment: + TraditionalComment + EndOfLineComment + +TraditionalComment: + / * CommentTail + +CommentTail: + * CommentTailStar + NotStar CommentTail + +CommentTailStar: + / + * CommentTailStar + NotStarNotSlash CommentTail + +NotStar: + InputCharacter but not * + LineTerminator + +NotStarNotSlash: + InputCharacter but not * or / + LineTerminator + +EndOfLineComment: + / / {InputCharacter} + +Identifier: + IdentifierChars but not Keyword or BooleanLiteral or NullLiteral + +IdentifierChars: + JavaLetter {JavaLetterOrDigit} + +Keyword: + abstract + continue + for + new + switch + assert + default + if + package + synchronized + boolean + do + goto + private + this + break + double + implements + protected + throw + byte + else + import + public + throws + case + enum + instanceof + return + transient + catch + extends + int + short + try + char + final + interface + static + void + class + finally + long + strictfp + volatile + const + float + native + super + while + +Literal: + IntegerLiteral + FloatingPointLiteral + BooleanLiteral + CharacterLiteral + StringLiteral + NullLiteral + +IntegerLiteral: + DecimalIntegerLiteral + HexIntegerLiteral + OctalIntegerLiteral + BinaryIntegerLiteral + +DecimalIntegerLiteral: + DecimalNumeral [IntegerTypeSuffix] + +HexIntegerLiteral: + HexNumeral [IntegerTypeSuffix] + +OctalIntegerLiteral: + OctalNumeral [IntegerTypeSuffix] + +BinaryIntegerLiteral: + BinaryNumeral [IntegerTypeSuffix] + +IntegerTypeSuffix: + l + L + +DecimalNumeral: + 0 + NonZeroDigit [Digits] + NonZeroDigit Underscores Digits + +NonZeroDigit: + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + +Digits: + Digit + Digit [DigitsAndUnderscores] Digit + +Digit: + 0 + NonZeroDigit + +DigitsAndUnderscores: + DigitOrUnderscore {DigitOrUnderscore} + +DigitOrUnderscore: + Digit + _ + +Underscores: + _ {_} + +HexNumeral: + 0 x HexDigits + 0 X HexDigits + +HexDigits: + HexDigit + HexDigit [HexDigitsAndUnderscores] HexDigit + +HexDigit: + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + a + b + c + d + e + f + A + B + C + D + E + F + +HexDigitsAndUnderscores: + HexDigitOrUnderscore {HexDigitOrUnderscore} + +HexDigitOrUnderscore: + HexDigit + _ + +OctalNumeral: + 0 OctalDigits + 0 Underscores OctalDigits + +OctalDigits: + OctalDigit + OctalDigit [OctalDigitsAndUnderscores] OctalDigit + +OctalDigit: + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + +OctalDigitsAndUnderscores: + OctalDigitOrUnderscore {OctalDigitOrUnderscore} + +OctalDigitOrUnderscore: + OctalDigit + _ + +BinaryNumeral: + 0 b BinaryDigits + 0 B BinaryDigits + +BinaryDigits: + BinaryDigit + BinaryDigit [BinaryDigitsAndUnderscores] BinaryDigit + +BinaryDigit: + 0 + 1 + +BinaryDigitsAndUnderscores: + BinaryDigitOrUnderscore {BinaryDigitOrUnderscore} + +BinaryDigitOrUnderscore: + BinaryDigit + _ + +FloatingPointLiteral: + DecimalFloatingPointLiteral + HexadecimalFloatingPointLiteral + +DecimalFloatingPointLiteral: + Digits . [Digits] [ExponentPart] [FloatTypeSuffix] + . Digits [ExponentPart] [FloatTypeSuffix] + Digits ExponentPart [FloatTypeSuffix] + Digits [ExponentPart] FloatTypeSuffix + +ExponentPart: + ExponentIndicator SignedInteger + +ExponentIndicator: + e + E + +SignedInteger: + [Sign] Digits + +Sign: + + + - + +FloatTypeSuffix: + f + F + d + D + +HexadecimalFloatingPointLiteral: + HexSignificand BinaryExponent [FloatTypeSuffix] + +HexSignificand: + HexNumeral [.] + 0 x [HexDigits] . HexDigits + 0 X [HexDigits] . HexDigits + +BinaryExponent: + BinaryExponentIndicator SignedInteger + +BinaryExponentIndicator: + p + P + +BooleanLiteral: + true + false + +CharacterLiteral: + ' SingleCharacter ' + ' EscapeSequence ' + +SingleCharacter: + InputCharacter but not ' or \ + +StringLiteral: + " {StringCharacter} " + +StringCharacter: + InputCharacter but not " or \ + EscapeSequence + +EscapeSequence: + \ b + \ t + \ n + \ f + \ r + \ " + \ ' + \ \ + OctalEscape + +OctalEscape: + \ OctalDigit + \ OctalDigit OctalDigit + \ ZeroToThree OctalDigit OctalDigit + +ZeroToThree: + 0 + 1 + 2 + 3 + +NullLiteral: + null + +Separator: + ( + ) + { + } + [ + ] + ; + , + . + ... + @ + :: + +Operator: + = + > + < + ! + ~ + ? + : + -> + == + >= + <= + != + && + || + ++ + -- + + + - + * + / + & + | + ^ + % + << + >> + >>> + += + -= + *= + /= + &= + |= + ^= + %= + <<= + >>= + >>>= diff --git a/meson.build b/meson.build index 1607088..d1f6908 100644 --- a/meson.build +++ b/meson.build @@ -12,6 +12,23 @@ project( ], ) +cpp_flags = [] +cpp_optional_flags = [] +if get_option('buildtype') == 'release' + # If asserts are disabled parameters and variables used for only that + # end up causing warnings + cpp_optional_flags += ['-Wno-unused-parameter', '-Wno-unused-variable', + '-Wno-unused-but-set-variable'] + cpp_flags += '-DNDEBUG' +endif +cpp = meson.get_compiler('cpp') +foreach flag : cpp_optional_flags + if cpp.has_argument(flag) + cpp_flags += flag + endif +endforeach +add_project_arguments([cpp_flags], language: 'cpp') + conf_data = configuration_data() conf_data.set('version', meson.project_version()) configure_file(input: 'src/config.h.in', @@ -191,6 +208,34 @@ java_uescape_dep = declare_dependency( dependencies: [buffer_dep, uio_dep], ) +src_lib = library( + 'src', + sources: [ + 'src/errors.cc', + 'src/errors.hh', + 'src/location.cc', + 'src/location.hh', + ], + include_directories: inc, +) +src_dep = declare_dependency( + link_with: src_lib, +) + +grammar_lib = library( + 'grammar', + sources: [ + 'src/grammar.cc', + 'src/grammar.hh', + ], + include_directories: inc, + dependencies: [io_dep, src_dep, str_dep], +) +grammar_dep = declare_dependency( + link_with: grammar_lib, + dependencies: [io_dep, src_dep, str_dep], +) + prefix_tree_lib = library( 'prefix_tree', sources: [ @@ -201,6 +246,69 @@ prefix_tree_lib = library( ) prefix_tree_dep = declare_dependency(link_with: prefix_tree_lib) +gen_tokens = executable( + 'gen_tokens', + sources: [ + 'src/gen_tokens.cc', + ], + include_directories: inc, + install : false, + dependencies : [ + args_dep, + grammar_dep, + prefix_tree_dep, + src_dep, + ], +) + +java_versions = [ + 'java-8', +] + +java_unicode_versions = { + 'java-8': '6.2.0', +} + +java_tokens_sources = [] +foreach java_version : java_versions + java_unicode_version = 'u' + java_unicode_versions[java_version].replace('.', '_') + java_tokens_sources += custom_target( + 'gen-tokens-' + java_version, + input: ['data/' + java_version + '/tokens.grammar'], + output: ['java_tokens_' + java_version + '.hh', + 'java_tokens_' + java_version + '.cc'], + command : [gen_tokens, '--namespace', + java_version.replace('.', '_').replace('-', '_'), + '--unicode', java_unicode_version, + '@INPUT@', '@OUTPUT@']) +endforeach + +java_lib = library( + 'java', + sources: [ + 'src/java_tokens.hh', + 'src/java_tokens.cc', + 'src/java_version.hh', + java_tokens_sources, + ], + include_directories: inc, + dependencies: [ + java_uescape_dep, + prefix_tree_dep, + src_dep, + str_dep, + uline_dep, + ] +) +java_dep = declare_dependency(link_with: java_lib, + dependencies: [ + java_uescape_dep, + prefix_tree_dep, + src_dep, + str_dep, + uline_dep, + ]) + jkc = executable( 'jkc', sources: [ @@ -351,12 +459,23 @@ test('java_uescape', executable( ], )) +test('java_tokens', executable( + 'test_java_tokens', + sources: ['test/java_tokens.cc'], + include_directories: inc, + dependencies: [ + java_dep, + test_dependencies, + ], +)) + test('prefix_tree', executable( 'test_prefix_tree', sources: ['test/prefix_tree.cc'], include_directories: inc, dependencies: [ prefix_tree_dep, + str_dep, test_dependencies, ], )) diff --git a/src/errors.cc b/src/errors.cc new file mode 100644 index 0000000..bf92c3b --- /dev/null +++ b/src/errors.cc @@ -0,0 +1,116 @@ +#include "errors.hh" + +#include "location.hh" + +#include <cstdint> +#include <format> +#include <iostream> +#include <memory> +#include <ostream> +#include <string> +#include <string_view> +#include <utility> + +namespace src { + +namespace { + +class FileErrors : public Errors { + public: + FileErrors(std::string filename, std::shared_ptr<ErrorsOutput> output) + : filename_(std::move(filename)), output_(std::move(output)) {} + + void err(Location loc, std::string_view msg) override { + ++errors_; + output_->println(std::format("{}:{}:{}: Error {}", filename_, loc.line, + loc.column, msg)); + } + + void warn(Location loc, std::string_view msg) override { + ++warnings_; + output_->println(std::format("{}:{}:{}: Warning {}", filename_, loc.line, + loc.column, msg)); + } + +#if !defined(NDEBUG) + void dbg(Location loc, std::string_view msg) override { + output_->println(std::format("{}:{}:{}: Debug {}", filename_, loc.line, + loc.column, msg)); + } +#endif + + [[nodiscard]] + uint64_t errors() const override { + return errors_; + } + + [[nodiscard]] + uint64_t warnings() const override { + return warnings_; + } + + private: + std::string const filename_; + std::shared_ptr<ErrorsOutput> output_; + uint64_t errors_{0}; + uint64_t warnings_{0}; +}; + +class IgnoreErrors : public Errors { + public: + IgnoreErrors() = default; + + void err(Location /* loc */, std::string_view /* msg */) override {} + void warn(Location /* loc */, std::string_view /* msg */) override {} + +#if !defined(NDEBUG) + void dbg(Location /* loc */, std::string_view /* msg */) override {} +#endif + + [[nodiscard]] + uint64_t errors() const override { + return 0; + } + + [[nodiscard]] + uint64_t warnings() const override { + return 0; + } +}; + +class OutputStreamErrorsOutput : public ErrorsOutput { + public: + explicit OutputStreamErrorsOutput(std::ostream& out) : out_(out) {} + + void println(std::string_view line) override { out_ << line << '\n'; } + + private: + std::ostream& out_; +}; + +} // namespace + +[[nodiscard]] +std::unique_ptr<Errors> file_errors(std::string filename, + std::shared_ptr<ErrorsOutput> output) { + if (!output) { + static std::shared_ptr<ErrorsOutput> g_stderr_output; + // TODO: Make thread-safe when needed + if (!g_stderr_output) + g_stderr_output = std::make_shared<OutputStreamErrorsOutput>(std::cerr); + output = g_stderr_output; + } + return std::make_unique<FileErrors>(std::move(filename), std::move(output)); +} + +[[nodiscard]] +std::unique_ptr<Errors> ignore_errors() { + return std::make_unique<IgnoreErrors>(); +} + +[[nodiscard]] +std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out) { + return std::make_unique<OutputStreamErrorsOutput>(out); +} + +} // namespace src diff --git a/src/errors.hh b/src/errors.hh new file mode 100644 index 0000000..d8b3d60 --- /dev/null +++ b/src/errors.hh @@ -0,0 +1,64 @@ +#ifndef ERRORS_HH +#define ERRORS_HH + +#include "location.hh" + +#include <iosfwd> +#include <memory> +#include <string> +#include <string_view> + +namespace src { + +class Errors { + public: + virtual ~Errors() = default; + + virtual void err(Location loc, std::string_view msg) = 0; + + virtual void warn(Location loc, std::string_view msg) = 0; + +#if !defined(NDEBUG) + virtual void dbg(Location loc, std::string_view msg) = 0; +#else + void dbg(Location, std::string_view) {} +#endif + + [[nodiscard]] + virtual uint64_t errors() const = 0; + [[nodiscard]] + virtual uint64_t warnings() const = 0; + + protected: + Errors() = default; + + Errors(Errors const&) = delete; + Errors& operator=(Errors const&) = delete; +}; + +class ErrorsOutput { + public: + virtual ~ErrorsOutput() = default; + + virtual void println(std::string_view line) = 0; + + protected: + ErrorsOutput() = default; + + ErrorsOutput(ErrorsOutput const&) = delete; + ErrorsOutput& operator=(ErrorsOutput const&) = delete; +}; + +[[nodiscard]] +std::unique_ptr<Errors> file_errors( + std::string filename, std::shared_ptr<ErrorsOutput> output = nullptr); + +[[nodiscard]] +std::unique_ptr<Errors> ignore_errors(); + +[[nodiscard]] +std::unique_ptr<ErrorsOutput> errors_output_ios(std::ostream& out); + +} // namespace src + +#endif // ERRORS_HH diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc new file mode 100644 index 0000000..2442c4f --- /dev/null +++ b/src/gen_tokens.cc @@ -0,0 +1,1019 @@ +#include "args.hh" +#include "errors.hh" +#include "grammar.hh" +#include "io.hh" +#include "prefix_tree.hh" + +#include <algorithm> +#include <cassert> +#include <charconv> +#include <cstddef> +#include <cstdint> +#include <fstream> +#include <iostream> +#include <set> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +namespace { + +enum class CharacterClass : uint8_t { + kWhiteSpace = 0, + kLineTerminator = 1, + kInputCharacter = 2, + kJavaLetter = 3, + kJavaLetterOrDigit = 4, +}; + +std::vector<std::string> const kCharacterClassNames( + {"WhiteSpace", "LineTerminator", "InputCharacter", "JavaLetter", + "JavaLetterOrDigit"}); + +enum class ReturnType : uint8_t { + kTokenAndSize, + kInternalAndSize, + kSize, +}; + +std::string make_define(std::string_view filename) { + std::string ret; + ret.reserve(filename.size()); + for (char c : filename) { + if ((c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_') { + ret.push_back(c); + } else if (c >= 'a' && c <= 'z') { + ret.push_back(static_cast<char>(c & ~0x20)); + } else { + ret.push_back('_'); + } + } + return ret; +} + +class Generator { + public: + bool generate(std::string_view header_name, std::string_view source_name, + std::string const& ns, std::string const& unicode_version, + grammar::Grammar& grammar); + + private: + void find_specific_elements(grammar::Element const& root); + void find_all_elements(grammar::Element const& root); + + void check_need_last(grammar::Element const& element); + bool find_report_last(grammar::Element const& root, + grammar::Element const& match); + + [[nodiscard]] + ReturnType get_return_type(grammar::Element const& element) const; + [[nodiscard]] + ReturnType get_return_type(uint8_t character_class) const; + + void write_matcher(std::ostream& out, grammar::Symbol const& symbol, + ReturnType return_type, std::string_view str_arg); + bool write_matcher(std::ostream& out, grammar::Definition const& definition, + ReturnType return_type, std::string_view indent); + bool write_matcher(std::ostream& out, grammar::Element const& element, + ReturnType return_type); + + std::set<std::string_view> above_specific_tokens_; + std::set<std::string_view> specific_tokens_; + std::set<grammar::Element const*> all_elements_; + std::set<std::string_view> copy_last_; + std::set<std::string_view> report_last_; +}; + +// Find the Elements that has at least one terminal or character class as symbol +// These will be the different tokens the tokenizer can return +void Generator::find_specific_elements(grammar::Element const& root) { + if (std::ranges::any_of(root.definitions, [](auto const& definition) { + return definition.symbols.size() > 1 || + definition.symbols[0].type == grammar::Symbol::Type::kTerminal; + })) { + specific_tokens_.insert(root.name); + return; + } + + above_specific_tokens_.insert(root.name); + + for (auto const& definition : root.definitions) { + for (auto const& symbol : definition.symbols) { + switch (symbol.type) { + case grammar::Symbol::Type::kNonTerminal: + find_specific_elements(*symbol.element); + break; + case grammar::Symbol::Type::kCharacterClass: + specific_tokens_.insert(kCharacterClassNames[symbol.char_class]); + break; + case grammar::Symbol::Type::kTerminal: + std::unreachable(); + } + } + } +} + +// Find elements that have definitions that has ZeroOrMore matches with a final condition +void Generator::check_need_last(grammar::Element const& element) { + for (auto const& definition : element.definitions) { + if (definition.symbols.size() < 2) + continue; + if (definition.symbols[definition.symbols.size() - 1].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[definition.symbols.size() - 1].type == + grammar::Symbol::Type::kNonTerminal && + (definition.symbols[definition.symbols.size() - 2].optional == + grammar::Symbol::Optional::kZeroOrOne || + definition.symbols[definition.symbols.size() - 2].optional == + grammar::Symbol::Optional::kZeroOrMore) && + definition.symbols[definition.symbols.size() - 2].type == + grammar::Symbol::Type::kNonTerminal) { + if (!copy_last_.contains(definition.symbols[definition.symbols.size() - 2] + .element->name)) { + find_report_last( + *definition.symbols[definition.symbols.size() - 2].element, + *definition.symbols[definition.symbols.size() - 1].element); + } + } + } +} + +// Find element that has match as a single definition, if so, return true and insert into +// report_last_. +bool Generator::find_report_last(grammar::Element const& root, + grammar::Element const& match) { + if (std::ranges::any_of(root.definitions, [&match](auto const& definition) { + return definition.symbols.size() == 1 && + definition.symbols[0].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[0].type == + grammar::Symbol::Type::kNonTerminal && + definition.symbols[0].element == &match; + })) { + report_last_.insert(root.name); + return true; + } + + for (auto const& definition : root.definitions) { + for (auto const& symbol : definition.symbols) { + if (symbol.type == grammar::Symbol::Type::kNonTerminal) { + if (!copy_last_.contains(symbol.element->name)) { + if (find_report_last(*symbol.element, match)) { + copy_last_.insert(root.name); + return true; + } + } + } + } + } + + return false; +} + +ReturnType Generator::get_return_type(grammar::Element const& element) const { + if (above_specific_tokens_.contains(element.name) || + specific_tokens_.contains(element.name)) + return ReturnType::kTokenAndSize; + if (copy_last_.contains(element.name) || report_last_.contains(element.name)) + return ReturnType::kInternalAndSize; + return ReturnType::kSize; +} + +ReturnType Generator::get_return_type(uint8_t character_class) const { + auto const& name = kCharacterClassNames[character_class]; + if (above_specific_tokens_.contains(name) || specific_tokens_.contains(name)) + return ReturnType::kTokenAndSize; + return ReturnType::kSize; +} + +void Generator::find_all_elements(grammar::Element const& root) { + auto pair = all_elements_.insert(&root); + if (!pair.second) + return; + + for (auto const& definition : root.definitions) { + for (auto const& symbol : definition.symbols) { + switch (symbol.type) { + case grammar::Symbol::Type::kNonTerminal: + find_all_elements(*symbol.element); + break; + case grammar::Symbol::Type::kCharacterClass: + case grammar::Symbol::Type::kTerminal: + break; + } + } + } +} + +void write_character_class_matchers(std::ostream& out, + std::string_view unicode_version) { + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> matchLineTerminator" + << "(std::string_view str) {\n" + // Tokenizer reads one line at a time, + // so line terminator matches end of line. + << " return str.empty() ? std::make_optional<size_t>(0)" + << " : std::nullopt;\n" + << "}\n" + << "\n"; + + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> matchInputCharacter" + << "(std::string_view str) {\n" + << " if (str.empty())\n" + << " return std::nullopt;\n" + // UnicodeInputCharacter but not CR or LF + << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n" + << " auto* ptr = start;\n" + << " u8::skip(ptr, start + str.size());\n" + << " return ptr - start;\n" + << "}\n" + << "\n"; + + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Token, size_t>> matchWhiteSpace" + << "(std::string_view str) {\n" + // LineTerminator + << " if (auto ret = matchLineTerminator(str); ret.has_value())\n" + << " return std::make_pair(Token::kWhiteSpace, ret.value());\n" + << " switch (str.front()) {\n" + // the ASCII SP character, also known as "space" + << " case ' ':\n" + // the ASCII HT character, also known as "horizontal tab" + << " case '\\t':\n" + // the ASCII FF character, also known as "form feed" + << " case '\\f':\n" + << " return std::make_pair(Token::kWhiteSpace, 1);\n" + << " default:\n" + << " return std::nullopt;\n" + << " }\n" + << "}\n" + << "\n"; + + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> matchJavaLetter" + << "(std::string_view str) {\n" + << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n" + << " auto* ptr = start;\n" + << " auto code = u8::read(ptr, ptr + str.size());\n" + << " if (!code.has_value())\n" + << " return std::nullopt;\n" + // any Unicode character that is a "Java letter" + // A "Java letter" is a character for which the method Character.isJavaIdentifierStart(int) returns true. + // A character may start a Java identifier if and only if one of the following conditions is true: + // isLetter(codePoint) returns true + // getType(codePoint) returns LETTER_NUMBER + // the referenced character is a currency symbol (such as '$') + // the referenced character is a connecting punctuation character (such as '_'). + << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version + << ")) {\n" + << " case u::GeneralCategory::LETTER_UPPERCASE:\n" + << " case u::GeneralCategory::LETTER_LOWERCASE:\n" + << " case u::GeneralCategory::LETTER_TITLECASE:\n" + << " case u::GeneralCategory::LETTER_MODIFIER:\n" + << " case u::GeneralCategory::LETTER_OTHER:\n" + << " case u::GeneralCategory::NUMBER_LETTER:\n" + << " case u::GeneralCategory::SYMBOL_CURRENCY:\n" + << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n" + << " return ptr - start;\n" + << " default:\n" + << " return std::nullopt;\n" + << " }\n" + << "}\n" + << "\n"; + + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> matchJavaLetterOrDigit" + << "(std::string_view str) {\n" + << " auto* const start = reinterpret_cast<uint8_t const*>(str.data());\n" + << " auto* ptr = start;\n" + << " auto code = u8::read(ptr, ptr + str.size());\n" + << " if (!code.has_value())\n" + << " return std::nullopt;\n" + // any Unicode character that is a "Java letter-or-digit" + // A "Java letter-or-digit" is a character for which the method Character.isJavaIdentifierPart(int) returns true. + // A character may be part of a Java identifier if any of the following conditions are true: + // it is a letter + // it is a currency symbol (such as '$') + // it is a connecting punctuation character (such as '_') + // it is a digit + // it is a numeric letter (such as a Roman numeral character) + // it is a combining mark + // it is a non-spacing mark + // isIdentifierIgnorable returns true for the character + << " switch (u::lookup_gc(code.value(), u::Version::" << unicode_version + << ")) {\n" + << " case u::GeneralCategory::LETTER_UPPERCASE:\n" + << " case u::GeneralCategory::LETTER_LOWERCASE:\n" + << " case u::GeneralCategory::LETTER_TITLECASE:\n" + << " case u::GeneralCategory::LETTER_MODIFIER:\n" + << " case u::GeneralCategory::LETTER_OTHER:\n" + << " case u::GeneralCategory::SYMBOL_CURRENCY:\n" + << " case u::GeneralCategory::PUNCTUATION_CONNECTOR:\n" + << " case u::GeneralCategory::NUMBER_DIGIT:\n" + << " case u::GeneralCategory::NUMBER_LETTER:\n" + << " case u::GeneralCategory::MARK_SPACING_COMBINING:\n" + << " case u::GeneralCategory::MARK_NONSPACING:\n" + << " case u::GeneralCategory::OTHER_FORMAT:\n" + << " return ptr - start;\n" + << " case u::GeneralCategory::OTHER_CONTROL:\n" + << " if ((/* code.value() >= 0 && */ code.value() <= 8) ||\n" + << " (code.value() >= 0xe && code.value() <= 0x1b) ||\n" + << " (code.value() >= 0x7f && code.value() <= 0x9f))\n" + << " return 1;\n" + << " break;\n" + << " default:\n" + << " break;\n" + << " }\n" + << " return std::nullopt;\n" + << "}\n" + << "\n"; +} + +std::ostream& quote(std::ostream& out, std::string_view in) { + out << '"'; + bool avoid_digit = false; + for (auto c : in) { + if (c == '"' || c == '\\') { + out << '\\'; + } else if (c < ' ' || (c & 0x80)) { + char tmp[4]; + std::to_chars(tmp, tmp + sizeof(tmp), c & 0xff, 8).ptr[0] = 0; + out << "\\" << tmp; + avoid_digit = true; + continue; + } else if (avoid_digit) { + if (c >= '0' && c <= '7') { + out << "\" \""; + } + } + avoid_digit = false; + out << c; + } + out << '"'; + return out; +} + +void match_return_type(std::ostream& out, ReturnType in_return_type, + std::string_view in_name, ReturnType out_return_type) { + switch (out_return_type) { + case ReturnType::kTokenAndSize: + switch (in_return_type) { + case ReturnType::kTokenAndSize: + break; + case ReturnType::kInternalAndSize: + out << ".transform([](auto pair) { return std::make_pair(Token::k" + << in_name << ", pair.second); }"; + break; + case ReturnType::kSize: + out << ".transform([](auto size) { return std::make_pair(Token::k" + << in_name << ", size); })"; + break; + } + break; + case ReturnType::kInternalAndSize: + switch (in_return_type) { + case ReturnType::kTokenAndSize: + out << ".transform([](auto pair) { return std::make_pair(Internal::k" + << in_name << ", pair.second); }"; + break; + case ReturnType::kInternalAndSize: + break; + case ReturnType::kSize: + if (in_name.empty()) { + out << ".transform([](auto size) { return " + "std::make_pair(Internal::UNDEFINED, size); })"; + } else { + out << ".transform([](auto size) { return " + "std::make_pair(Internal::k" + << in_name << ", size); })"; + } + break; + } + break; + case ReturnType::kSize: + if (in_return_type != ReturnType::kSize) { + out << ".transform([](auto pair) { return pair.second; })"; + } + break; + } +} + +void Generator::write_matcher(std::ostream& out, grammar::Symbol const& symbol, + ReturnType return_type, + std::string_view str_arg) { + std::string_view in_name; + ReturnType in_return_type; + + switch (symbol.type) { + case grammar::Symbol::Type::kTerminal: + in_return_type = ReturnType::kSize; + out << "(" << str_arg << ".starts_with("; + quote(out, symbol.value); + out << ") ? std::make_optional<size_t>(" << symbol.value.size() + << ") : " << "std::nullopt)"; + break; + case grammar::Symbol::Type::kNonTerminal: + out << "match" << symbol.element->name << "(" << str_arg << ")"; + in_return_type = get_return_type(*symbol.element); + in_name = symbol.element->name; + break; + case grammar::Symbol::Type::kCharacterClass: + out << "match" << kCharacterClassNames[symbol.char_class] << "(" + << str_arg << ")"; + in_return_type = get_return_type(symbol.char_class); + in_name = kCharacterClassNames[symbol.char_class]; + break; + } + + match_return_type(out, in_return_type, in_name, return_type); +} + +bool Generator::write_matcher(std::ostream& out, + grammar::Definition const& definition, + ReturnType return_type, std::string_view indent) { + if (definition.symbols.size() == 1 && + definition.symbols[0].optional == grammar::Symbol::Optional::kRequired) { + out << indent << "return "; + write_matcher(out, definition.symbols[0], return_type, "str"); + out << ";\n"; + return true; + } + + std::string_view size_suffix; + switch (return_type) { + case ReturnType::kTokenAndSize: + case ReturnType::kInternalAndSize: + size_suffix = "->second"; + break; + case ReturnType::kSize: + size_suffix = ".value()"; + break; + } + + if (definition.symbols.size() > 1 && + definition.symbols[0].optional == grammar::Symbol::Optional::kRequired && + definition.symbols[1].optional == grammar::Symbol::Optional::kExcluded) { + bool first = true; + for (auto const& symbol : definition.symbols) { + if (first) { + out << indent << "auto first_ret = "; + write_matcher(out, symbol, return_type, "str"); + out << ";\n" + << indent << "if (!first_ret.has_value())\n" + << indent << " return first_ret;\n" + << indent << "std::optional<size_t> ret;\n" + << indent << "auto tmp = str.substr(0, first_ret" << size_suffix + << ");\n"; + first = false; + } else { + if (symbol.optional != grammar::Symbol::Optional::kExcluded) { + std::cerr << "Non-excluded after at least one excluded\n"; + return false; + } + out << indent << "ret = "; + write_matcher(out, symbol, ReturnType::kSize, "tmp"); + out << ";\n" + << indent << "if (ret.has_value() && ret.value() == tmp.size())\n" + << indent << " return std::nullopt;\n"; + } + } + out << indent << "return first_ret;\n"; + return true; + } + + if (std::ranges::all_of(definition.symbols, [](auto const& symbol) { + return symbol.optional == grammar::Symbol::Optional::kRequired; + })) { + out << indent << "size_t tot = 0;\n"; + bool first = true; + for (auto const& symbol : definition.symbols) { + std::string indent2(indent); + if (first) { + out << indent2 << "auto ret = "; + write_matcher(out, symbol, return_type, "str"); + out << ";\n"; + first = false; + } else { + out << indent2 << "ret = "; + write_matcher(out, symbol, return_type, "str.substr(tot)"); + out << ";\n"; + } + out << indent2 << "if (!ret.has_value())\n" + << indent2 << " return ret;\n"; + out << indent2 << "tot += ret" << size_suffix << ";\n"; + } + switch (return_type) { + case ReturnType::kInternalAndSize: + // Return last internal + out << indent << "return std::make_pair(ret->first, tot);\n"; + break; + case ReturnType::kTokenAndSize: + std::cerr << "Unable to return token and size\n"; + return false; + case ReturnType::kSize: + out << indent << "return tot;\n"; + break; + } + return true; + } + + out << indent << "size_t tot = 0;\n"; + bool last_internal = false; + switch (return_type) { + case ReturnType::kInternalAndSize: + last_internal = true; + out << indent << "std::optional<Internal> last_internal;\n"; + break; + case ReturnType::kTokenAndSize: + case ReturnType::kSize: + break; + } + bool at_least_one_required = false; + bool first = true; + bool first_internal = true; + bool next_internal = false; + for (size_t i = 0; i < definition.symbols.size(); ++i) { + auto const& symbol = definition.symbols[i]; + std::string indent2(indent); + bool have_internal = next_internal; + next_internal = false; + ReturnType symbol_return_type = return_type; + + if (symbol.optional != grammar::Symbol::Optional::kRequired && + i + 1 < definition.symbols.size() && + definition.symbols[i + 1].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[i + 1].type == grammar::Symbol::Type::kNonTerminal) { + symbol_return_type = ReturnType::kInternalAndSize; + next_internal = true; + } + + switch (symbol_return_type) { + case ReturnType::kTokenAndSize: + case ReturnType::kInternalAndSize: + size_suffix = "->second"; + break; + case ReturnType::kSize: + size_suffix = ".value()"; + break; + } + + switch (symbol.optional) { + case grammar::Symbol::Optional::kRequired: + at_least_one_required = true; + break; + case grammar::Symbol::Optional::kZeroOrOne: + break; + case grammar::Symbol::Optional::kZeroOrMore: + if (first) { + switch (symbol_return_type) { + case ReturnType::kTokenAndSize: + out << indent << "std::optional<std::pair<Token, size_t>> ret;\n"; + break; + case ReturnType::kInternalAndSize: + out << indent + << "std::optional<std::pair<Internal, size_t>> ret;\n"; + break; + case ReturnType::kSize: + out << indent << "std::optional<size_t> ret;\n"; + break; + } + first = false; + } + out << indent << "while (true) {\n"; + indent2 += " "; + break; + case grammar::Symbol::Optional::kExcluded: + std::cerr << "Excluded mixed with conditional\n"; + return false; + } + if (symbol_return_type == return_type) { + if (first) { + out << indent2 << "auto ret = "; + write_matcher(out, symbol, symbol_return_type, "str"); + first = false; + } else { + out << indent2 << "ret = "; + write_matcher(out, symbol, symbol_return_type, "str.substr(tot)"); + } + out << ";\n"; + } else { + if (first_internal) { + out << indent2 << "auto ret_internal = "; + write_matcher(out, symbol, symbol_return_type, + first ? "str" : "str.substr(tot)"); + first_internal = false; + } else { + out << indent2 << "ret_internal = "; + write_matcher(out, symbol, symbol_return_type, "str.substr(tot)"); + } + out << ";\n"; + if (first) { + out << indent2 << "auto ret = ret_internal"; + first = false; + } else { + out << indent2 << "ret = ret_internal"; + } + match_return_type(out, symbol_return_type, "", return_type); + out << ";\n"; + } + switch (symbol.optional) { + case grammar::Symbol::Optional::kRequired: + out << indent2 << "if (!ret.has_value()) {\n"; + if (have_internal && + symbol.type == grammar::Symbol::Type::kNonTerminal) { + out << indent2 + << " if (!ret_internal.has_value() || ret_internal->first != " + "Internal::k" + << symbol.element->name << ")\n" + << indent2 << " return ret;\n"; + } else { + out << indent2 << " return ret;\n"; + } + out << indent2 << "} else {\n" + << indent2 << " tot += ret" << size_suffix << ";\n"; + if (last_internal) + out << indent2 << " last_internal = ret->first;\n"; + out << indent2 << "}\n"; + break; + case grammar::Symbol::Optional::kZeroOrOne: + if (symbol_return_type == ReturnType::kTokenAndSize) { + out << indent2 << "tot += ret.has_value() ? ret->second : 0;\n"; + } else { + out << indent2 << "tot += ret.value_or(0);\n"; + } + if (last_internal) + out << indent2 << "if (ret.has_value())\n" + << indent2 << " last_internal = ret->first;\n"; + break; + case grammar::Symbol::Optional::kZeroOrMore: + out << indent2 << "if (!ret.has_value())\n" + << indent2 << " break;\n" + << indent2 << "tot += ret" << size_suffix << ";\n"; + if (last_internal) + out << indent2 << "last_internal = ret->first;\n"; + out << indent << "}\n"; + break; + case grammar::Symbol::Optional::kExcluded: + assert(false); + break; + } + } + switch (return_type) { + case ReturnType::kInternalAndSize: + // Return last internal + if (at_least_one_required) { + out << indent << "return std::make_pair(last_internal.value(), tot);\n"; + } else { + out << indent << "if (last_internal.has_value())\n" + << indent + << " return std::make_pair(last_internal.value(), tot);\n" + << indent << "return std::make_pair(Internal::UNDEFINED, tot);\n"; + } + break; + case ReturnType::kTokenAndSize: + std::cerr << "Unable to return token and size\n"; + return false; + case ReturnType::kSize: + out << indent << "return tot;\n"; + break; + } + return true; +} + +void declare_matcher(std::ostream& out, grammar::Element const& element, + ReturnType return_type) { + switch (return_type) { + case ReturnType::kSize: + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> match" << element.name + << "(std::string_view str);\n"; + break; + case ReturnType::kTokenAndSize: + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Token, size_t>> match" + << element.name << "(std::string_view str);\n"; + break; + case ReturnType::kInternalAndSize: + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Internal, size_t>> match" + << element.name << "(std::string_view str);\n"; + break; + } +} + +bool Generator::write_matcher(std::ostream& out, + grammar::Element const& element, + ReturnType return_type) { + ReturnType sub_return_type = return_type; + bool make_token = false; + + switch (return_type) { + case ReturnType::kSize: + out << "[[nodiscard]]\n" + << "inline std::optional<size_t> match" << element.name + << "(std::string_view str) {\n"; + break; + case ReturnType::kTokenAndSize: + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Token, size_t>> match" + << element.name << "(std::string_view str) {\n"; + + if (specific_tokens_.contains(element.name)) { + sub_return_type = ReturnType::kSize; + make_token = true; + } + break; + case ReturnType::kInternalAndSize: + out << "[[nodiscard]]\n" + << "inline std::optional<std::pair<Internal, size_t>> match" + << element.name << "(std::string_view str) {\n"; + break; + } + + if (element.definitions.size() == 1) { + if (make_token) { + out << " auto ret = [str]() -> std::optional<size_t> {\n"; + if (!write_matcher(out, element.definitions[0], sub_return_type, + " ")) { + std::cerr << "Error in " << element.name << "\n"; + return false; + } + out << " }();\n" + << " return ret.transform([](auto size) {\n" + << " return std::make_pair(Token::k" << element.name + << ", size); });\n"; + } else { + if (!write_matcher(out, element.definitions[0], sub_return_type, " ")) { + std::cerr << "Error in " << element.name << "\n"; + return false; + } + } + } else if (std::ranges::all_of( + element.definitions, [](auto const& definition) { + return definition.symbols.size() == 1 && + definition.symbols[0].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[0].type == + grammar::Symbol::Type::kTerminal; + })) { + if (std::ranges::all_of(element.definitions, [](auto const& definition) { + return definition.symbols[0].value.size() == 1; + })) { + out << " if (!str.empty()) {\n" + << " switch (str.front()) {\n"; + for (auto const& definition : element.definitions) { + out << " case '" << definition.symbols[0].value[0] << "':\n"; + } + out << " return 1;\n" + << " default:\n" + << " break;\n" + << " }\n" + << " }\n" + << " return std::nullopt;\n"; + } else { + auto builder = prefix_tree::builder(); + for (auto const& definition : element.definitions) { + builder->add(definition.symbols[0].value); + } + auto tree = builder->build(); + if (!tree.has_value()) { + std::cerr << "To large prefix tree\n"; + return false; + } + out << " static const auto tree = "; + quote(out, tree.value()) << "sv;\n"; + out << " return prefix_tree::lookup(tree, str)"; + if (make_token) { + out << ".transform([](auto size) {\n" + << " return std::make_pair(Token::k" << element.name + << ", size); })"; + } + out << ";\n"; + } + } else { + bool first = true; + std::string_view ret_type; + switch (sub_return_type) { + case ReturnType::kTokenAndSize: + ret_type = "std::optional<std::pair<Token, size_t>>"; + break; + case ReturnType::kInternalAndSize: + ret_type = "std::optional<std::pair<Internal, size_t>>"; + break; + case ReturnType::kSize: + ret_type = "std::optional<size_t>"; + break; + } + for (auto const& definition : element.definitions) { + if (first) { + first = false; + out << " auto tmp = [str]() -> " << ret_type << " {\n"; + if (!write_matcher(out, definition, sub_return_type, " ")) { + std::cerr << "Error in " << element.name << "\n"; + return false; + } + out << " }();\n"; + out << " auto ret = tmp;\n"; + } else { + out << " tmp = [str]() -> " << ret_type << " {\n"; + if (!write_matcher(out, definition, sub_return_type, " ")) { + std::cerr << "Error in " << element.name << "\n"; + return false; + } + out << " }();\n" + << " if (tmp.has_value()) {\n"; + if (sub_return_type == ReturnType::kTokenAndSize) { + out << " if (!ret.has_value() || ret.value().second < " + "tmp.value().second) {\n"; + } else { + out << " if (!ret.has_value() || ret.value() < tmp.value()) {\n"; + } + out << " ret = tmp;\n" + << " }\n" + << " }\n"; + } + } + if (make_token) { + out << " return ret.transform([](auto size) {\n" + << " return std::make_pair(Token::k" << element.name + << ", size); });\n"; + } else { + out << " return ret;\n"; + } + } + + out << "}\n" + << "\n"; + return true; +} + +bool Generator::generate(std::string_view header_name, + std::string_view source_name, std::string const& ns, + std::string const& unicode_version, + grammar::Grammar& grammar) { + std::fstream header{std::string(header_name), + std::fstream::trunc | std::fstream::out}; + std::fstream source{std::string(source_name), + std::fstream::trunc | std::fstream::out}; + + auto header_guard = make_define(header_name); + + header << "#ifndef " << header_guard << "\n" + << "#define " << header_guard << "\n" + << "\n" + << "#include \"prefix_tree.hh\"\n" + << "\n" + << "#include <cstddef>\n" + << "#include <cstdint>\n" + << "#include <optional>\n" + << "#include <string_view>\n" + << "#include <utility>\n" + << "\n" + << "namespace " << ns << " {\n" + << "\n"; + + find_specific_elements(grammar.root()); + + find_all_elements(grammar.root()); + + for (auto& element : all_elements_) { + check_need_last(*element); + } + + header << "enum class Token : " + << (specific_tokens_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n"; + for (auto const& token : specific_tokens_) { + header << " k" << token << ",\n"; + } + header << "};\n"; + + header << "\n" + << "[[nodiscard]]\n" + << "std::optional<std::pair<Token, size_t>>" + << " matchNext(std::string_view str);\n" + << "\n" + << "} // " << ns << "\n" + << "\n" + << "#endif // " << header_guard << "\n"; + + source << "#include \"" << header_name << "\"\n" + << "\n" + << "#include \"prefix_tree.hh\"\n" + << "#include \"u.hh\"\n" + << "#include \"u8.hh\"\n" + << "\n" + << "#include <cstddef>\n" + << "#include <optional>\n" + << "#include <string_view>\n" + << "#include <utility>\n" + << "\n" + << "using namespace std::literals::string_view_literals;\n" + << "\n" + << "// NOLINTBEGIN(readability-else-after-return)\n" + << "\n" + << "namespace " << ns << " {\n"; + + source << "namespace {\n" + << "\n"; + + write_character_class_matchers(source, unicode_version); + + source << "\n" + << "enum class Internal : " + << (all_elements_.size() < 256 ? "uint8_t" : "uint16_t") << " {\n" + << " UNDEFINED,\n"; + for (auto* element : all_elements_) { + source << " k" << element->name << ",\n"; + } + source << "};\n" + << "\n"; + + for (auto* element : all_elements_) { + declare_matcher(source, *element, get_return_type(*element)); + } + + if (std::ranges::any_of(all_elements_, [this, &source](auto* element) { + auto sts = get_return_type(*element); + return !write_matcher(source, *element, sts); + })) { + return false; + } + + source << "\n" + << "} // namespace\n"; + + source << "\n" + << "std::optional<std::pair<Token, size_t>>" + << "matchNext(std::string_view str) {\n" + << " return match" << grammar.root().name << "(str);\n" + << "}" + << "\n" + << "} // namespace " << ns << "\n" + << "\n" + << "// NOLINTEND(readability-else-after-return)\n" + << "\n"; + + return true; +} + +} // namespace + +int main(int argc, char** argv) { + auto args = Args::create(); + auto opt_help = args->option('h', "help", "display this text and exit"); + auto opt_ns = args->option_argument('\0', "namespace", "ARG", + "Namespace for tokenizer"); + auto opt_unicode = + args->option_argument('u', "unicode", "ARG", "Unicode version"); + std::vector<std::string_view> arguments; + if (!args->run(argc, argv, &arguments)) { + args->print_error(std::cerr); + std::cerr << "Try `gen_tokens --help` for usage\n"; + return 1; + } + if (opt_help->is_set()) { + std::cout << "Usage: `gen_tokens [OPTIONS...] tokens.grammar" + << " OUTPUT.hh OUTPUT.cc`\n" + << "Generates a tokenizer for grammar.\n" + << "\n"; + args->print_help(std::cout); + return 0; + } + if (!opt_ns->is_set()) { + std::cerr << "No namespace given.\n" + << "Try `gen_tokens --help` for usage\n"; + return 1; + } + if (!opt_unicode->is_set()) { + std::cerr << "No unicode version given.\n" + << "Try `gen_tokens --help` for usage\n"; + return 1; + } + auto ns = opt_ns->argument(); + auto unicode = opt_unicode->argument(); + if (arguments.size() != 3) { + std::cerr << "Expecting three arguments. No more, no less.\n" + << "Try `gen_tokens --help` for usage\n"; + return 1; + } + + auto filename = std::string(arguments[0]); + auto reader = io::open(filename); + if (!reader.has_value()) { + std::cerr << "Unable to open " << filename << '\n'; + return 1; + } + auto errors = src::file_errors(std::move(filename)); + auto grammar = + grammar::load(std::move(reader.value()), kCharacterClassNames, *errors); + if (!grammar || errors->errors() > 0) + return 1; + + Generator generator; + if (!generator.generate(arguments[1], arguments[2], ns, unicode, *grammar)) + return 1; + return 0; +} diff --git a/src/gen_ugc.cc b/src/gen_ugc.cc index 2670803..9100946 100644 --- a/src/gen_ugc.cc +++ b/src/gen_ugc.cc @@ -28,7 +28,7 @@ std::map<std::string, u::GeneralCategory, std::less<>> str2gc{ {"Lo", u::GeneralCategory::LETTER_OTHER}, {"Mn", u::GeneralCategory::MARK_NONSPACING}, - {"Mc", u::GeneralCategory::MARK_SPACING_COMBINDING}, + {"Mc", u::GeneralCategory::MARK_SPACING_COMBINING}, {"Me", u::GeneralCategory::MARK_SPACING_ENCLOSING}, {"Nd", u::GeneralCategory::NUMBER_DIGIT}, @@ -277,6 +277,7 @@ int main(int argc, char** argv) { args->option_argument('p', "prefix", "ARG", "Prefix for exported method"); std::vector<std::string_view> arguments; if (!args->run(argc, argv, &arguments)) { + args->print_error(std::cerr); std::cerr << "Try `gen_u --help` for usage\n"; return 1; } diff --git a/src/grammar.cc b/src/grammar.cc new file mode 100644 index 0000000..25c4d64 --- /dev/null +++ b/src/grammar.cc @@ -0,0 +1,268 @@ +#include "grammar.hh" + +#include "errors.hh" +#include "line.hh" +#include "location.hh" +#include "str.hh" + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <format> +#include <functional> +#include <map> +#include <memory> +#include <optional> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +namespace grammar { + +namespace { + +class GrammarImpl : public Grammar { + public: + explicit GrammarImpl(std::vector<std::unique_ptr<Element>> elements) + : elements_(std::move(elements)) {} + + [[nodiscard]] + Element const& root() const override { + return *elements_[0]; + } + + private: + std::vector<std::unique_ptr<Element>> elements_; +}; + +struct FirstPassElement { + src::Location loc; + std::vector<std::string> definitions; + + explicit FirstPassElement(src::Location loc) : loc(loc) {} +}; + +class GrammarLoader { + public: + GrammarLoader(std::unique_ptr<io::Reader> reader, + std::vector<std::string> const& character_classes, + src::Errors& errors) + : reader_(line::open(std::move(reader))), + character_classes_(character_classes), + errors_(errors) {} + + std::unique_ptr<Grammar> load() { + // Read whole file in a first pass, before parsing definitions. + std::map<std::string, FirstPassElement> first_pass_elements; + FirstPassElement* last_element = nullptr; + + std::map<std::string_view, uint8_t, std::less<>> cc_lookup; + for (uint8_t i = 0; i < static_cast<uint8_t>(character_classes_.size()); + ++i) { + cc_lookup.emplace(character_classes_[i], i); + } + + while (true) { + auto line = reader_->read(); + auto loc = src::Location(reader_->number(), 0); + if (!line.has_value()) { + if (line.error() == io::ReadError::Eof) + break; + errors_.err(loc, "Error reading"); + return nullptr; + } + if (line.value().empty() || line.value().front() == '#') + continue; + if (line.value().front() == ' ') { + // Continue on last element + size_t i = 1; + while (i < line.value().size() && line.value()[i] == ' ') + ++i; + if (i == line.value().size()) { + errors_.err(loc, "Unexpected line, only spaces"); + continue; + } + if (last_element == nullptr) { + errors_.err(loc, "Expected element before indented lines"); + continue; + } + last_element->definitions.emplace_back(line.value().substr(i)); + } else { + // New element + if (line.value().back() != ':') { + errors_.err( + loc, + "Unexpected line, not indented but also not ending with a ':'"); + continue; + } + + auto name = line.value().substr(0, line.value().size() - 1); + if (cc_lookup.contains(name)) { + errors_.warn( + loc, std::format("Element {} overrides character class", name)); + } + auto pair = first_pass_elements.emplace(name, FirstPassElement(loc)); + if (!pair.second) { + errors_.err(loc, std::format("Duplicate element {}", name)); + } + last_element = &pair.first->second; + } + } + + if (first_pass_elements.empty()) { + errors_.err(src::Location(reader_->number(), 0), "No elements found"); + return nullptr; + } + + std::vector<std::unique_ptr<Element>> second_pass_elements; + std::map<std::string_view, size_t, std::less<>> second_pass_lookup; + for (auto const& pair : first_pass_elements) { + auto element = std::make_unique<Element>(); + element->name = pair.first; + second_pass_lookup.emplace(element->name, second_pass_elements.size()); + second_pass_elements.emplace_back(std::move(element)); + } + + auto it = second_pass_elements.begin(); + for (auto const& pair : first_pass_elements) { + auto const& element = *it++; + if (pair.second.definitions.empty()) { + errors_.err(pair.second.loc, + std::format("No definitions for {}", pair.first)); + continue; + } + std::vector<std::string_view> in_symbols; + for (auto const& in_definition : pair.second.definitions) { + str::split(in_definition, in_symbols); + + std::vector<Symbol> out_symbols; + bool exclude = false; + bool expect_not = false; + for (auto in_symbol : in_symbols) { + Symbol out_symbol; + if (exclude) { + if (in_symbol == "or") + continue; + out_symbol.optional = Symbol::Optional::kExcluded; + } else if (expect_not) { + expect_not = false; + if (in_symbol == "not") { + exclude = true; + } else { + errors_.err(pair.second.loc, "but is not followed by not"); + } + continue; + } + if (in_symbol == "but") { + expect_not = true; + continue; + } + if (in_symbol.front() == '{' && in_symbol.back() == '}') { + if (exclude) { + errors_.err(pair.second.loc, + "Optional and exclude doesn't work together"); + } else { + out_symbol.optional = Symbol::Optional::kZeroOrMore; + } + in_symbol = in_symbol.substr(1, in_symbol.size() - 2); + } else if (in_symbol.front() == '[' && in_symbol.back() == ']') { + if (exclude) { + errors_.err(pair.second.loc, + "Optional and exclude doesn't work together"); + } else { + out_symbol.optional = Symbol::Optional::kZeroOrOne; + } + in_symbol = in_symbol.substr(1, in_symbol.size() - 2); + } + auto it2 = second_pass_lookup.find(in_symbol); + if (it2 != second_pass_lookup.end()) { + out_symbol.type = Symbol::Type::kNonTerminal; + out_symbol.element = second_pass_elements[it2->second].get(); + } else { + auto it3 = cc_lookup.find(in_symbol); + if (it3 != cc_lookup.end()) { + out_symbol.type = Symbol::Type::kCharacterClass; + out_symbol.char_class = it3->second; + } else { + out_symbol.type = Symbol::Type::kTerminal; + out_symbol.value = in_symbol; + } + } + out_symbols.emplace_back(std::move(out_symbol)); + } + + if (expect_not) { + errors_.err(pair.second.loc, "but is not followed by not"); + } + + if (out_symbols.empty()) { + errors_.err(pair.second.loc, "no symbols found in definition"); + continue; + } + + element->definitions.emplace_back( + Definition{.symbols = std::move(out_symbols)}); + } + } + + // Find root and move it first (if needed) + std::vector<size_t> used(second_pass_elements.size(), 0); + for (auto const& element : second_pass_elements) { + for (auto const& definition : element->definitions) { + for (auto const& symbol : definition.symbols) { + switch (symbol.type) { + case Symbol::Type::kTerminal: + case Symbol::Type::kCharacterClass: + break; + case Symbol::Type::kNonTerminal: + used[second_pass_lookup.find(symbol.element->name)->second]++; + break; + } + } + } + } + + std::optional<size_t> root_index; + for (size_t i = 0; i < used.size(); ++i) { + if (used[i] == 0) { + if (root_index.has_value()) { + errors_.warn(first_pass_elements.find(second_pass_elements[i]->name) + ->second.loc, + "Is not referenced but also not root"); + } else { + root_index = i; + } + } + } + + if (root_index.has_value()) { + if (root_index.value() != 0) { + std::swap(second_pass_elements[0], + second_pass_elements[root_index.value()]); + } + } else { + errors_.err( + first_pass_elements.find(second_pass_elements[0]->name)->second.loc, + "No root element found"); + } + + return std::make_unique<GrammarImpl>(std::move(second_pass_elements)); + } + + private: + std::unique_ptr<line::Reader> reader_; + std::vector<std::string> const& character_classes_; + src::Errors& errors_; +}; + +} // namespace + +std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader, + std::vector<std::string> const& character_classes, + src::Errors& errors) { + GrammarLoader loader(std::move(reader), character_classes, errors); + return loader.load(); +} + +} // namespace grammar diff --git a/src/grammar.hh b/src/grammar.hh new file mode 100644 index 0000000..13beec4 --- /dev/null +++ b/src/grammar.hh @@ -0,0 +1,80 @@ +#ifndef GRAMMAR_HH +#define GRAMMAR_HH + +#include "errors.hh" +#include "io.hh" + +#include <cstdint> +#include <span> +#include <string> +#include <vector> + +namespace grammar { + +struct Element; + +struct Symbol { + enum class Type : uint8_t { + // value == terminal, as UTF-8 + kTerminal, + + // element != nullptr + kNonTerminal, + + // char_class != 0 + kCharacterClass, + }; + + Type type; + + enum class Optional : uint8_t { + // Symbol is NOT optional. + kRequired, + // Symbol is optional. + kZeroOrOne, + // Symbol is optional and can repeat. + kZeroOrMore, + // Symbol should be excluded from previous symbol. + // Example: InputCharacter but not * or / + kExcluded, + }; + + Optional optional{Optional::kRequired}; + + uint8_t char_class{0}; + + Element const* element{nullptr}; + std::string value; +}; + +struct Definition { + std::vector<Symbol> symbols; +}; + +struct Element { + std::string name; + + std::vector<Definition> definitions; +}; + +class Grammar { + public: + virtual ~Grammar() = default; + + [[nodiscard]] + virtual Element const& root() const = 0; + + protected: + Grammar() = default; + + Grammar(Grammar const&) = delete; + Grammar& operator=(Grammar const&) = delete; +}; + +std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader, + std::vector<std::string> const& character_classes, + src::Errors& errors); + +} // namespace grammar + +#endif // GRAMMAR_HH diff --git a/src/java_tokens.cc b/src/java_tokens.cc new file mode 100644 index 0000000..59748c1 --- /dev/null +++ b/src/java_tokens.cc @@ -0,0 +1,374 @@ +#include "java_tokens.hh" + +#include "errors.hh" +#include "java_tokens_java-8.hh" +#include "java_uescape.hh" +#include "str.hh" +#include "u8.hh" +#include "uline.hh" + +#include <cassert> +#include <charconv> +#include <cstddef> +#include <expected> +#include <format> +#include <limits> +#include <memory> +#include <optional> +#include <string> +#include <string_view> +#include <system_error> +#include <utility> + +namespace java { + +namespace { + +template <typename MatchNext, typename MatchToken> +class TokensImpl : public Tokens { + public: + TokensImpl(std::unique_ptr<io::Reader> reader, + std::unique_ptr<src::Errors> errors, TokensConfig config) + : reader_(u8::line::open(u8::java::open(std::move(reader)))), + errors_(std::move(errors)), + config_(config) {} + + std::expected<Token, io::ReadError> read() override { + while (true) { + while (line_.empty()) { + auto maybe_line = reader_->read(); + if (!maybe_line.has_value()) + return std::unexpected(maybe_line.error()); + line_ = maybe_line.value(); + location_.line = reader_->number(); + location_.column = 0; + } + + Token token; + token.loc = location_; + + auto maybe_token_pair = match_next_(line_); + if (maybe_token_pair.has_value()) { + token.str = line_.substr(0, maybe_token_pair->second); + location_.column += u8len(token.str); + line_ = line_.substr(maybe_token_pair->second); + switch (maybe_token_pair->first) { + case MatchToken::kBinaryIntegerLiteral: + handle_int_literal(token, /* base */ 2); + break; + case MatchToken::kBooleanLiteral: + token.type = Token::Type::kLiteralBoolean; + token.int_value = token.str != "false"; + break; + case MatchToken::kCharacterLiteral: + token.type = Token::Type::kLiteralCharacter; + if (token.str[1] == '\\') { + token.int_value = + unescape(token.str.substr(1, token.str.size() - 2)).first; + } else { + auto* ptr = + reinterpret_cast<uint8_t const*>(token.str.data() + 1); + auto* end = ptr + token.str.size() - 2; + token.int_value = u8::read(ptr, end).value(); + } + break; + case MatchToken::kDecimalFloatingPointLiteral: + handle_float_literal(token); + break; + case MatchToken::kDecimalIntegerLiteral: + handle_int_literal(token); + break; + case MatchToken::kEndOfLineComment: + token.type = Token::Type::kComment; + token.str = str::trim(token.str.substr(2)); + break; + case MatchToken::kHexIntegerLiteral: + handle_int_literal(token, /* base */ 16); + break; + case MatchToken::kHexadecimalFloatingPointLiteral: + handle_float_literal(token, /* base */ 16); + break; + case MatchToken::kIdentifier: + token.type = Token::Type::kIdentifier; + break; + case MatchToken::kKeyword: + token.type = Token::Type::kKeyword; + break; + case MatchToken::kNullLiteral: + token.type = Token::Type::kLiteralNull; + break; + case MatchToken::kOctalIntegerLiteral: + handle_int_literal(token, /* base */ 8); + break; + case MatchToken::kOperator: + token.type = Token::Type::kOperator; + break; + case MatchToken::kSeparator: + token.type = Token::Type::kSeparator; + break; + case MatchToken::kStringLiteral: + token.type = Token::Type::kLiteralString; + token.str = + unescape_if_needed(token.str.substr(1, token.str.size() - 2)); + break; + case MatchToken::kTraditionalComment: { + token.type = Token::Type::kComment; + size_t s = 2; + while (s < token.str.size() && token.str[s] == '*') + ++s; + token.str = + str::trim(token.str.substr(s, token.str.size() - 2 - s)); + token.int_value = static_cast<int64_t>(s - 1); + // TODO: handle multiline + break; + } + case MatchToken::kWhiteSpace: + continue; + } + } else { + errors_->err(location_, std::format("Invalid token: {}", line_)); + token.type = Token::Type::kError; + token.str = line_; + } + return token; + } + } + + private: + void handle_int_literal_error(Token& token, std::string_view str, + std::errc err, int base) { + if (err == std::errc::result_out_of_range) { + // Java assumes two completent (so 0xffff_ffff is -1) and also, negative literals + // are read as positive (because the operator '-' is a separate token) + uint64_t tmp; + auto ret = + std::from_chars(str.data(), str.data() + str.size(), tmp, base); + if (ret.ec == std::errc()) { + token.type = ret.ptr < str.data() + str.size() + ? Token::Type::kLiteralLong + : Token::Type::kLiteralInt; + token.int_value = static_cast<int64_t>(tmp); + return; + } + } + errors_->err(location_, + std::format("Invalid integer literal: {}", token.str)); + token.type = Token::Type::kError; + } + + void handle_int_literal(Token& token, int base = 10) { + size_t prefix; + switch (base) { + case 16: // 0x + case 2: // 0b + prefix = 2; + break; + case 8: // 0 + prefix = 1; + break; + default: + prefix = 0; + break; + } + std::optional<char> suffix; + if (token.str.find('_') == std::string_view::npos) { + auto ret = std::from_chars(token.str.data() + prefix, + token.str.data() + token.str.size(), + token.int_value, base); + if (ret.ec != std::errc()) { + handle_int_literal_error(token, token.str.substr(prefix), ret.ec, base); + return; + } + if (ret.ptr < token.str.data() + token.str.size()) + suffix = *ret.ptr; + } else { + std::string tmp; + tmp.reserve(token.str.size() - prefix); + for (size_t i = prefix; i < token.str.size(); ++i) { + if (token.str[i] != '_') { + tmp.push_back(token.str[i]); + } + } + auto ret = std::from_chars(tmp.data(), tmp.data() + tmp.size(), + token.int_value, base); + if (ret.ec != std::errc()) { + handle_int_literal_error(token, tmp, ret.ec, base); + return; + } + if (ret.ptr < tmp.data() + tmp.size()) + suffix = *ret.ptr; + } + if (suffix.has_value() && + (suffix.value() == 'l' || suffix.value() == 'L')) { + token.type = Token::Type::kLiteralLong; + } else { + if (base == 10 && + token.int_value > + static_cast<int64_t>(1) + std::numeric_limits<int32_t>::max()) { + errors_->err(location_, + std::format("Invalid integer literal: {}", token.str)); + token.type = Token::Type::kError; + return; + } + if (std::cmp_greater(token.int_value, + std::numeric_limits<uint32_t>::max())) { + errors_->err(location_, + std::format("Invalid integer literal: {}", token.str)); + token.type = Token::Type::kError; + return; + } + token.type = Token::Type::kLiteralInt; + token.int_value = static_cast<int32_t>(token.int_value); + } + } + + void handle_float_literal(Token& token, int base = 10) { + size_t prefix; + std::chars_format fmt; + switch (base) { + case 16: // 0x + fmt = std::chars_format::general | std::chars_format::hex; + prefix = 2; + break; + default: + fmt = std::chars_format::general; + prefix = 0; + break; + } + + std::from_chars_result ret; + if (token.str.ends_with("f") || token.str.ends_with("F")) { + // float and double do not parse exactly the same, so use a float parser for float. + float tmp; + ret = std::from_chars(token.str.data() + prefix, + token.str.data() + token.str.size(), tmp, fmt); + token.type = Token::Type::kLiteralFloatingPoint; + token.float_value = tmp; + } else { + ret = std::from_chars(token.str.data() + prefix, + token.str.data() + token.str.size(), + token.float_value, fmt); + token.type = Token::Type::kLiteralDoubleFloatingPoint; + } + + if (ret.ec != std::errc()) { + // Java allows 0 with just a suffix, std::from_chars does not. + if (token.str == "0f" || token.str == "0F") { + token.type = Token::Type::kLiteralFloatingPoint; + token.float_value = 0; + return; + } + if (token.str == "0d" || token.str == "0D") { + token.type = Token::Type::kLiteralDoubleFloatingPoint; + token.float_value = 0; + return; + } + errors_->err(location_, + std::format("Invalid float literal: {}", token.str)); + token.type = Token::Type::kError; + } + } + + std::string_view unescape_if_needed(std::string_view str) { + auto back_slash = str.find('\\'); + if (back_slash == std::string_view::npos) + return str; + unescape_tmp_.clear(); + unescape_tmp_.reserve(str.size()); + size_t last = 0; + uint8_t tmp[4]; + while (true) { + unescape_tmp_.append(str, last, back_slash - last); + auto ret = unescape(str.substr(back_slash)); + auto* ptr = tmp; + u8::write(ptr, tmp + sizeof(tmp), ret.first); + unescape_tmp_.append( + std::string_view(reinterpret_cast<char*>(tmp), ptr - tmp)); + last = back_slash + ret.second; + back_slash = str.find('\\', last); + if (back_slash == std::string::npos) { + unescape_tmp_.append(str, last); + break; + } + } + return unescape_tmp_; + } + + // Strings coming here have already been validated by the tokenizer + static std::pair<uint16_t, size_t> unescape(std::string_view in) { + assert(in.front() == '\\'); + assert(in.size() > 1); + switch (in[1]) { + case 'b': + return std::make_pair(8, 2); + case 't': + return std::make_pair(9, 2); + case 'n': + return std::make_pair(10, 2); + case 'f': + return std::make_pair(12, 2); + case 'r': + return std::make_pair(13, 2); + case '"': + return std::make_pair(34, 2); + case '\'': + return std::make_pair(39, 2); + case '\\': + return std::make_pair(92, 2); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': { + uint8_t tmp; + auto ret = std::from_chars(in.data() + 1, in.data() + in.size(), tmp, + /* base */ 8); + return std::make_pair(tmp, ret.ptr - in.data()); + } + default: + std::unreachable(); + } + } + + static size_t u8len(std::string_view str) { + auto* ptr = reinterpret_cast<uint8_t const*>(str.data()); + auto* const end = ptr + str.size(); + size_t count = 0; + while (u8::skip(ptr, end)) + ++count; + return count; + } + + std::unique_ptr<u8::line::Reader> reader_; + std::unique_ptr<src::Errors> errors_; + TokensConfig const config_; + MatchNext match_next_; + std::string_view line_; + Location location_; + std::string unescape_tmp_; +}; + +struct MatchNextJava8 { + std::optional<std::pair<java_8::Token, size_t>> operator()( + std::string_view str) const { + return java_8::matchNext(str); + } +}; + +} // namespace + +std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader, + std::unique_ptr<src::Errors> errors, + TokensConfig config) { + switch (config.version) { + case Version::kJava8: + return std::make_unique<TokensImpl<MatchNextJava8, java_8::Token>>( + std::move(reader), std::move(errors), config); + } + std::unreachable(); +} + +} // namespace java diff --git a/src/java_tokens.hh b/src/java_tokens.hh new file mode 100644 index 0000000..6fbefcb --- /dev/null +++ b/src/java_tokens.hh @@ -0,0 +1,92 @@ +#ifndef JAVA_TOKENS_HH +#define JAVA_TOKENS_HH + +#include "errors.hh" +#include "io.hh" +#include "java_version.hh" // IWYU pragma: export +#include "location.hh" + +#include <expected> +#include <memory> +#include <string_view> + +namespace java { + +using src::Location; + +struct Token { + enum class Type : uint8_t { + // str is content of comment, excluding heading or trailing slash and star. + // int_value is number of stars at head, 0 for single line, 1 for /* and + // 2 for /** and so on. + kComment, + + // str is identifier + kIdentifier, + + // str is keyword, int_value is Keyword index + kKeyword, + + // str is separator, int_value is Separator index + kSeparator, + + // str is operator, int_value is Operator index + kOperator, + + // int_value is literal value + kLiteralInt, + + // int_value is literal value + kLiteralLong, + + // str is literal value + kLiteralString, + + // int_value is literal value as unicode code-point + kLiteralCharacter, + + kLiteralNull, + + // float_value is literal value + kLiteralFloatingPoint, + + // float_value is literal value + kLiteralDoubleFloatingPoint, + + // int_value is literal value, 0 = false, anything else = true + kLiteralBoolean, + + kError, + }; + + Type type; + Location loc; + std::string_view str; + int64_t int_value{0}; + double float_value{0}; +}; + +struct TokensConfig { + // Source version of Java file + Version version = Version::kMax; +}; + +class Tokens { + public: + virtual ~Tokens() = default; + + virtual std::expected<Token, io::ReadError> read() = 0; + + protected: + Tokens() = default; + Tokens(Tokens const&) = delete; + Tokens& operator=(Tokens const&) = delete; +}; + +[[nodiscard]] std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader, + std::unique_ptr<src::Errors>, + TokensConfig config = {}); + +} // namespace java + +#endif // JAVA_TOKENS_HH diff --git a/src/java_version.hh b/src/java_version.hh new file mode 100644 index 0000000..444ae36 --- /dev/null +++ b/src/java_version.hh @@ -0,0 +1,16 @@ +#ifndef JAVA_VERSION_HH +#define JAVA_VERSION_HH + +#include <cstdint> + +namespace java { + +enum class Version : uint8_t { + kJava8 = 8, + + kMax = kJava8, +}; + +} // namespace java + +#endif // JAVA_VERSION_HH diff --git a/src/location.cc b/src/location.cc new file mode 100644 index 0000000..3fa1075 --- /dev/null +++ b/src/location.cc @@ -0,0 +1,12 @@ +#include "location.hh" + +#include <ostream> + +namespace src { + +std::ostream& operator<<(std::ostream& out, Location const& loc) { + out << loc.line << ':' << loc.column; + return out; +} + +} // namespace src diff --git a/src/location.hh b/src/location.hh new file mode 100644 index 0000000..1a210cb --- /dev/null +++ b/src/location.hh @@ -0,0 +1,31 @@ +#ifndef LOCATION_HH +#define LOCATION_HH + +#include <compare> +#include <cstdint> +#include <iosfwd> + +namespace src { + +struct Location { + uint64_t line; + uint16_t column; + + constexpr Location() : line(0), column(0) {} + + Location(uint64_t line, uint16_t column) : line(line), column(column) {} + + [[nodiscard]] + std::strong_ordering operator<=>(Location const& loc) const { + auto ret = line <=> loc.line; + if (ret == std::strong_ordering::equal) + return column <=> loc.column; + return ret; + } +}; + +std::ostream& operator<<(std::ostream& out, Location const& loc); + +} // namespace src + +#endif // LOCATION_HH diff --git a/src/prefix_tree.cc b/src/prefix_tree.cc index f16df22..56466e8 100644 --- a/src/prefix_tree.cc +++ b/src/prefix_tree.cc @@ -42,6 +42,17 @@ class BuilderImpl : public Builder { [[nodiscard]] std::optional<std::string> build() const override { + auto ret = build(1); + if (!ret.has_value()) + ret = build(2); + return ret; + } + + private: + [[nodiscard]] + std::optional<std::string> build(uint8_t size) const { + assert(size > 0); + std::string tree; std::string strings; @@ -49,20 +60,19 @@ class BuilderImpl : public Builder { for (auto const& str : strings_) tmp.emplace(str); - if (!write_tree(tmp, tree, strings)) + if (!write_tree(tmp, tree, strings, size)) return std::nullopt; std::string header; - if (tree.size() > 0xffff) + write_u8(header, size); + if (!write_size(header, size, tree.size())) return std::nullopt; - write_u16(header, tree.size()); return header + tree + strings; } - private: bool write_tree(std::set<std::string_view> const& input, std::string& tree, - std::string& strings) const { + std::string& strings, uint8_t size) const { std::map<char, std::set<std::string_view>> buckets; bool match = false; for (auto& str : input) { @@ -76,8 +86,8 @@ class BuilderImpl : public Builder { write_u8(tree, buckets.size() + (match ? 1 : 0)); if (match) { write_u8(tree, 0); - write_u16(tree, 0); - write_u16(tree, 0); + write_size(tree, size, 0); + write_size(tree, size, 0); } std::string extra; for (auto& pair : buckets) { @@ -95,23 +105,23 @@ class BuilderImpl : public Builder { } write_u8(tree, 1 + str.size()); - if (strings.size() > 0xffff) + if (!write_size(tree, size, strings.size())) return false; - write_u16(tree, strings.size()); strings.push_back(pair.first); strings.append(str); if (extra.size() > 0xffff) return false; - write_u16(tree, extra.size()); + if (!write_size(tree, size, extra.size())) + return false; if (str.empty()) { - if (!write_tree(pair.second, extra, strings)) + if (!write_tree(pair.second, extra, strings, size)) return false; } else { std::set<std::string_view> tmp; for (auto& str2 : pair.second) { tmp.emplace(str2.substr(str.size())); } - if (!write_tree(tmp, extra, strings)) + if (!write_tree(tmp, extra, strings, size)) return false; } } @@ -120,12 +130,27 @@ class BuilderImpl : public Builder { return true; } + static bool write_size(std::string& str, uint8_t size, size_t value) { + if (size == 1) { + if (value > 0xff) + return false; + write_u8(str, value); + return true; + } + if (size == 2) { + if (value > 0xffff) + return false; + write_u16(str, value); + return true; + } + assert(false); + return false; + } + std::set<std::string> strings_; }; -} // namespace - -std::optional<size_t> lookup(std::string_view tree, std::string_view str) { +std::optional<size_t> lookup16(std::string_view tree, std::string_view str) { size_t base_str = 2 + get_u16(tree, 0); size_t node = 2; std::optional<size_t> match; @@ -168,6 +193,63 @@ std::optional<size_t> lookup(std::string_view tree, std::string_view str) { return match; } +std::optional<size_t> lookup8(std::string_view tree, std::string_view str) { + size_t base_str = 1 + get_u8(tree, 0); + size_t node = 1; + std::optional<size_t> match; + std::optional<size_t> earlier_match; + + while (node < base_str && !str.empty()) { + auto children = get_u8(tree, node); + + if (children == 0) { + // Leaf + return match; + } + + size_t child_node = node + 1; + size_t child_end = child_node + (static_cast<size_t>(children) * 3); + for (; child_node < child_end; child_node += 3) { + uint8_t len = get_u8(tree, child_node); + uint8_t offset = get_u8(tree, child_node + 1); + + if (str.starts_with(tree.substr(base_str + offset, len))) { + // Match but not a leaf, always first in the list of children + if (len == 0) { + earlier_match = match; + continue; + } + match = match.value_or(0) + len; + str = str.substr(len); + auto jump = get_u8(tree, child_node + 2); + node = child_end + jump; + break; + } + } + + if (child_node == child_end) + return earlier_match; + } + + if (node == base_str) + return earlier_match; + return match; +} + +} // namespace + +std::optional<size_t> lookup(std::string_view tree, std::string_view str) { + auto size = get_u8(tree, 0); + if (size == 1) { + return lookup8(tree.substr(1), str); + } + if (size == 2) { + return lookup16(tree.substr(1), str); + } + assert(false); + return std::nullopt; +} + std::unique_ptr<Builder> builder() { return std::make_unique<BuilderImpl>(); } } // namespace prefix_tree @@ -6,6 +6,15 @@ namespace str { +namespace { + +[[nodiscard]] +inline bool is_space(char c) { + return c == ' ' || c == '\t' || c == '\r' || c == '\n'; +} + +} // namespace + void split(std::string_view str, std::vector<std::string_view>& out, char separator, bool keep_empty) { out.clear(); @@ -31,4 +40,14 @@ std::vector<std::string_view> split(std::string_view str, char separator, return vec; } +std::string_view trim(std::string_view str) { + size_t s = 0; + size_t e = str.size(); + while (s < e && is_space(str[s])) + ++s; + while (e > s && is_space(str[e - 1])) + --e; + return str.substr(s, e - s); +} + } // namespace str @@ -13,6 +13,9 @@ void split(std::string_view str, std::vector<std::string_view>& out, char separator = ' ', bool keep_empty = false); +[[nodiscard]] +std::string_view trim(std::string_view str); + } // namespace str #endif // STR_HH @@ -13,7 +13,7 @@ enum class GeneralCategory : uint8_t { LETTER_OTHER, MARK_NONSPACING, - MARK_SPACING_COMBINDING, + MARK_SPACING_COMBINING, MARK_SPACING_ENCLOSING, NUMBER_DIGIT, diff --git a/test/java_tokens.cc b/test/java_tokens.cc new file mode 100644 index 0000000..df37409 --- /dev/null +++ b/test/java_tokens.cc @@ -0,0 +1,551 @@ +#include "java_tokens.hh" + +#include "errors.hh" +#include "io.hh" + +#include <gtest/gtest.h> +#include <memory> +#include <string_view> +#include <utility> + +using namespace std::literals::string_view_literals; + +namespace { + +class JavaTokens : public testing::TestWithParam<java::Version> { + protected: + static std::unique_ptr<src::Errors> make_errors() { + return src::file_errors( + testing::UnitTest::GetInstance()->current_test_info()->name()); + } +}; + +} // namespace + +TEST_P(JavaTokens, empty_class) { + auto input = io::memory(R"(class Empty { +})"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kKeyword, ret->type); + EXPECT_EQ("class", ret->str); + EXPECT_EQ(1, ret->loc.line); + EXPECT_EQ(0, ret->loc.column); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kIdentifier, ret->type); + EXPECT_EQ("Empty", ret->str); + EXPECT_EQ(1, ret->loc.line); + EXPECT_EQ(6, ret->loc.column); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kSeparator, ret->type); + EXPECT_EQ("{", ret->str); + EXPECT_EQ(1, ret->loc.line); + EXPECT_EQ(12, ret->loc.column); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kSeparator, ret->type); + EXPECT_EQ("}", ret->str); + EXPECT_EQ(2, ret->loc.line); + EXPECT_EQ(0, ret->loc.column); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, traditional_comment) { + auto input = io::memory(R"(/* this comment /* // /** ends here: */)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kComment, ret->type); + EXPECT_EQ("this comment /* // /** ends here:", ret->str); + EXPECT_EQ(1, ret->int_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, single_line_comment) { + auto input = io::memory(R"(// this is a comment)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kComment, ret->type); + EXPECT_EQ("this is a comment", ret->str); + EXPECT_EQ(0, ret->int_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, identifiers) { + auto input = io::memory(R"(String i3 αρετη MAX_VALUE isLetterOrDigit)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kIdentifier, ret->type); + EXPECT_EQ("String", ret->str); + EXPECT_EQ(1, ret->loc.line); + EXPECT_EQ(0, ret->loc.column); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kIdentifier, ret->type); + EXPECT_EQ("i3", ret->str); + EXPECT_EQ(1, ret->loc.line); + EXPECT_EQ(7, ret->loc.column); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kIdentifier, ret->type); + EXPECT_EQ("αρετη", ret->str); + EXPECT_EQ(1, ret->loc.line); + EXPECT_EQ(10, ret->loc.column); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kIdentifier, ret->type); + EXPECT_EQ("MAX_VALUE", ret->str); + EXPECT_EQ(1, ret->loc.line); + EXPECT_EQ(16, ret->loc.column); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kIdentifier, ret->type); + EXPECT_EQ("isLetterOrDigit", ret->str); + EXPECT_EQ(1, ret->loc.line); + EXPECT_EQ(26, ret->loc.column); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, int_literals) { + auto input = io::memory(R"( +0 2 0372 0xDada_Cafe 1996 0x00_FF__00_FF +)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(0L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(2L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(250L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(-623195394L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(1996L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(16711935L, ret->int_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, long_literals) { + auto input = io::memory(R"( +0l 0777L 0x100000000L 2_147_483_648L 0xC0B0L +)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(0, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(511, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(4294967296LL, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(2147483648LL, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(49328, ret->int_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, int_literal_min_max) { + auto input = io::memory(R"( +2147483647 +-2147483648 +0x7fff_ffff +0177_7777_7777 +0b0111_1111_1111_1111_1111_1111_1111_1111 +0x8000_0000 +0200_0000_0000 +0b1000_0000_0000_0000_0000_0000_0000_0000 +0xffff_ffff +0377_7777_7777 +0b1111_1111_1111_1111_1111_1111_1111_1111 +)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(2147483647L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kOperator, ret->type); + EXPECT_EQ("-", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(-2147483647L - 1, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(2147483647L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(2147483647L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(2147483647L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(-2147483647L - 1, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(-2147483647L - 1, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(-2147483647L - 1, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(-1L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(-1L, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralInt, ret->type); + EXPECT_EQ(-1L, ret->int_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, long_literal_min_max) { + auto input = io::memory(R"( +9223372036854775807L +-9223372036854775808L +0x7fff_ffff_ffff_ffffL +07_7777_7777_7777_7777_7777L +0b0111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111L +0x8000_0000_0000_0000L +010_0000_0000_0000_0000_0000L +0b1000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000L +0xffff_ffff_ffff_ffffL +017_7777_7777_7777_7777_7777L +0b1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111_1111L +)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(9223372036854775807LL, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kOperator, ret->type); + EXPECT_EQ("-", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(9223372036854775807LL, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(9223372036854775807LL, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(9223372036854775807LL, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(-9223372036854775807LL - 1, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(-1LL, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(-1LL, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralLong, ret->type); + EXPECT_EQ(-1LL, ret->int_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, float_literals) { + auto input = io::memory(R"( +1e1f 2.f .3f 0f 3.14f 6.022137e+23f +)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type); + EXPECT_EQ(1e1F, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type); + EXPECT_EQ(2.F, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type); + EXPECT_EQ(.3F, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type); + EXPECT_EQ(0.F, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type); + EXPECT_EQ(3.14F, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralFloatingPoint, ret->type); + EXPECT_EQ(6.022137e+23F, ret->float_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, double_literals) { + auto input = io::memory(R"( +1e1 2. .3 0.0 3.14 1e-9d 1e137 +)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type); + EXPECT_EQ(1e1, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type); + EXPECT_EQ(2., ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type); + EXPECT_EQ(.3, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type); + EXPECT_EQ(0.0, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type); + EXPECT_EQ(3.14, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type); + EXPECT_EQ(1e-9, ret->float_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralDoubleFloatingPoint, ret->type); + EXPECT_EQ(1e137, ret->float_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, bool_literals) { + auto input = io::memory(R"( +true false +)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralBoolean, ret->type); + EXPECT_TRUE(ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralBoolean, ret->type); + EXPECT_EQ(0, ret->int_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, char_literals) { + auto input = io::memory(R"( +'a' + +'%' + +'\t' + +'\\' + +'\'' + +'\u03a9' + +'\uFFFF' + +'\177' + +'™' +)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type); + EXPECT_EQ('a', ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type); + EXPECT_EQ('%', ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type); + EXPECT_EQ('\t', ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type); + EXPECT_EQ('\\', ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type); + EXPECT_EQ('\'', ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type); + EXPECT_EQ(0x3a9, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type); + EXPECT_EQ(0xffff, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type); + EXPECT_EQ(0177, ret->int_value); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralCharacter, ret->type); + EXPECT_EQ(0x2122, ret->int_value); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, string_literals) { + auto input = io::memory(R"( +"" +"\"" +"This is a string" +"This is a " + + "two-line string" +)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralString, ret->type); + EXPECT_EQ("", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralString, ret->type); + EXPECT_EQ(R"(")", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralString, ret->type); + EXPECT_EQ("This is a string", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralString, ret->type); + EXPECT_EQ("This is a ", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kOperator, ret->type); + EXPECT_EQ("+", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralString, ret->type); + EXPECT_EQ("two-line string", ret->str); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, string_escapes) { + auto input = io::memory(R"("\b\t\n\f\r\"\'\\\0\1\177")"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralString, ret->type); + EXPECT_EQ("\b\t\n\f\r\"'\\\0\1\177"sv, ret->str); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +TEST_P(JavaTokens, null) { + auto input = io::memory("null"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralNull, ret->type); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); +} + +INSTANTIATE_TEST_SUITE_P(AllVersions, JavaTokens, + testing::Values(java::Version::kJava8)); diff --git a/test/java_uescape.cc b/test/java_uescape.cc index a6657d8..a9f6746 100644 --- a/test/java_uescape.cc +++ b/test/java_uescape.cc @@ -132,6 +132,15 @@ TEST(java_uescape_u8, bad_surrogate) { EXPECT_EQ(io::ReadError::InvalidData, ret.error()); } +TEST(java_uescape_u8, issue) { + auto reader = u8::line::open(u8::java::open(io::memory(R"('\u03a9')"))); + auto ret = reader->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ("'Ω'", ret.value()); + ret = reader->read(); + EXPECT_FALSE(ret.has_value()); +} + TEST(java_uescape_u16, escaped_escape) { auto reader = u16::java::open(io::memory(R"(\\u2122=\u2122)")); std::u16string tmp; diff --git a/test/prefix_tree.cc b/test/prefix_tree.cc index 6c00adb..86c8990 100644 --- a/test/prefix_tree.cc +++ b/test/prefix_tree.cc @@ -1,5 +1,7 @@ #include "prefix_tree.hh" +#include "str.hh" + #include <gtest/gtest.h> TEST(prefix_tree, empty) { @@ -45,3 +47,27 @@ TEST(prefix_tree, sanity) { ASSERT_TRUE(ret.has_value()); EXPECT_EQ(3, ret.value()); } + +TEST(prefix_tree, many_and_long) { + auto builder = prefix_tree::builder(); + for (auto str : str::split( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do " + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut " + "enim ad minim veniam, quis nostrud exercitation ullamco laboris " + "nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in " + "reprehenderit in voluptate velit esse cillum dolore eu fugiat " + "nulla pariatur. Excepteur sint occaecat cupidatat non proident, " + "sunt in culpa qui officia deserunt mollit anim id est laborum.")) { + builder->add(str); + } + auto tree = builder->build(); + ASSERT_TRUE(tree.has_value()); + auto ret = prefix_tree::lookup(tree.value(), ""); + EXPECT_FALSE(ret.has_value()); + ret = prefix_tree::lookup(tree.value(), "Lorem"); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(5, ret.value()); + ret = prefix_tree::lookup(tree.value(), "cillum"); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(6, ret.value()); +} @@ -711,7 +711,7 @@ TEST_P(UnicodeVersionTest, lookup_gc) { EXPECT_EQ(u::lookup_gc(0x483, GetParam()), u::GeneralCategory::MARK_NONSPACING); EXPECT_EQ(u::lookup_gc(0x93b, GetParam()), - u::GeneralCategory::MARK_SPACING_COMBINDING); + u::GeneralCategory::MARK_SPACING_COMBINING); EXPECT_EQ(u::lookup_gc(0x20de, GetParam()), u::GeneralCategory::MARK_SPACING_ENCLOSING); |
