diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2025-09-29 09:39:49 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2025-09-29 09:50:47 +0200 |
| commit | d196d51e07f50f3510c43ad375c5559b58860023 (patch) | |
| tree | 3432b8e99e306d0ece9f29ddad1e2945f88a1481 | |
| parent | 1e9e51dae1c01bab7562911b958c47528b8011c8 (diff) | |
java: Add tokens support for Java 21
Java 21 has some new keywords. I opted to modify the java-8 grammar to
use the new names as well, even though they are not going to match
anything there, because it makes the tokenizer easier to write.
| -rw-r--r-- | data/java-21/tokens.grammar | 423 | ||||
| -rw-r--r-- | data/java-8/tokens.grammar | 12 | ||||
| -rw-r--r-- | meson.build | 2 | ||||
| -rw-r--r-- | src/gen_tokens.cc | 78 | ||||
| -rw-r--r-- | src/grammar.cc | 37 | ||||
| -rw-r--r-- | src/java_tokens.cc | 61 | ||||
| -rw-r--r-- | src/java_tokens.hh | 5 | ||||
| -rw-r--r-- | src/java_version.hh | 3 | ||||
| -rw-r--r-- | test/java_tokens.cc | 56 |
9 files changed, 654 insertions, 23 deletions
diff --git a/data/java-21/tokens.grammar b/data/java-21/tokens.grammar new file mode 100644 index 0000000..db935b2 --- /dev/null +++ b/data/java-21/tokens.grammar @@ -0,0 +1,423 @@ +InputElement: + WhiteSpace + Comment + Token + +Token: + Identifier + Keyword + Literal + Separator + Operator + +Comment: + TraditionalComment + EndOfLineComment + +TraditionalComment: + / * CommentTail + +CommentTail: + * CommentTailStar + NotStar CommentTail + +CommentTailStar: + / + * CommentTailStar + NotStarNotSlash CommentTail + +NotStar: + InputCharacter but not * + LineTerminator + +NotStarNotSlash: + InputCharacter but not * or / + LineTerminator + +EndOfLineComment: + / / {InputCharacter} + +Identifier: + IdentifierChars but not a ReservedKeyword or BooleanLiteral or NullLiteral + +IdentifierChars: + JavaLetter {JavaLetterOrDigit} + +Keyword: + ReservedKeyword + ContextualKeyword + +ReservedKeyword: + abstract + assert + boolean + break + byte + case + catch + char + class + const + continue + default + do + double + else + enum + extends + final + finally + float + for + goto + if + implements + import + instanceof + int + interface + long + native + new + package + private + protected + public + return + short + static + strictfp + super + switch + synchronized + this + throw + throws + transient + try + void + volatile + while + _ + +ContextualKeyword: + exports + module + non-sealed + open + opens + permits + provides + record + requires + sealed + to + transitive + uses + var + when + with + yield + +Literal: + IntegerLiteral + FloatingPointLiteral + BooleanLiteral + CharacterLiteral + StringLiteral + TextBlock + NullLiteral + +IntegerLiteral: + DecimalIntegerLiteral + HexIntegerLiteral + OctalIntegerLiteral + BinaryIntegerLiteral + +DecimalIntegerLiteral: + DecimalNumeral [IntegerTypeSuffix] + +HexIntegerLiteral: + HexNumeral [IntegerTypeSuffix] + +OctalIntegerLiteral: + OctalNumeral [IntegerTypeSuffix] + +BinaryIntegerLiteral: + BinaryNumeral [IntegerTypeSuffix] + 
+IntegerTypeSuffix: + l + L + +DecimalNumeral: + 0 + NonZeroDigit [Digits] + NonZeroDigit Underscores Digits + +NonZeroDigit: + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + +Digits: + Digit + Digit [DigitsAndUnderscores] Digit + +Digit: + 0 + NonZeroDigit + +DigitsAndUnderscores: + DigitOrUnderscore {DigitOrUnderscore} + +DigitOrUnderscore: + Digit + _ + +Underscores: + _ {_} + +HexNumeral: + 0 x HexDigits + 0 X HexDigits + +HexDigits: + HexDigit + HexDigit [HexDigitsAndUnderscores] HexDigit + +HexDigit: + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + a + b + c + d + e + f + A + B + C + D + E + F + +HexDigitsAndUnderscores: + HexDigitOrUnderscore {HexDigitOrUnderscore} + +HexDigitOrUnderscore: + HexDigit + _ + +OctalNumeral: + 0 OctalDigits + 0 Underscores OctalDigits + +OctalDigits: + OctalDigit + OctalDigit [OctalDigitsAndUnderscores] OctalDigit + +OctalDigit: + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + +OctalDigitsAndUnderscores: + OctalDigitOrUnderscore {OctalDigitOrUnderscore} + +OctalDigitOrUnderscore: + OctalDigit + _ + +BinaryNumeral: + 0 b BinaryDigits + 0 B BinaryDigits + +BinaryDigits: + BinaryDigit + BinaryDigit [BinaryDigitsAndUnderscores] BinaryDigit + +BinaryDigit: + 0 + 1 + +BinaryDigitsAndUnderscores: + BinaryDigitOrUnderscore {BinaryDigitOrUnderscore} + +BinaryDigitOrUnderscore: + BinaryDigit + _ + +FloatingPointLiteral: + DecimalFloatingPointLiteral + HexadecimalFloatingPointLiteral + +DecimalFloatingPointLiteral: + Digits . [Digits] [ExponentPart] [FloatTypeSuffix] + . Digits [ExponentPart] [FloatTypeSuffix] + Digits ExponentPart [FloatTypeSuffix] + Digits [ExponentPart] FloatTypeSuffix + +ExponentPart: + ExponentIndicator SignedInteger + +ExponentIndicator: + e + E + +SignedInteger: + [Sign] Digits + +Sign: + + + - + +FloatTypeSuffix: + f + F + d + D + +HexadecimalFloatingPointLiteral: + HexSignificand BinaryExponent [FloatTypeSuffix] + +HexSignificand: + HexNumeral [.] + 0 x [HexDigits] . HexDigits + 0 X [HexDigits] . 
HexDigits + +BinaryExponent: + BinaryExponentIndicator SignedInteger + +BinaryExponentIndicator: + p + P + +BooleanLiteral: + true + false + +CharacterLiteral: + ' SingleCharacter ' + ' EscapeSequence ' + +SingleCharacter: + InputCharacter but not ' or \ + +StringLiteral: + " {StringCharacter} " + +StringCharacter: + InputCharacter but not " or \ + EscapeSequence + +TextBlock: + " " " {TextBlockWhiteSpace} LineTerminator {TextBlockCharacter} " " " + +TextBlockWhiteSpace: + WhiteSpace but not LineTerminator + +TextBlockCharacter: + InputCharacter but not \ + EscapeSequence + LineTerminator + +EscapeSequence: + \ b + \ s + \ t + \ n + \ f + \ r + \ LineTerminator + \ " + \ ' + \ \ + OctalEscape + +OctalEscape: + \ OctalDigit + \ OctalDigit OctalDigit + \ ZeroToThree OctalDigit OctalDigit + +ZeroToThree: + 0 + 1 + 2 + 3 + +NullLiteral: + null + +Separator: + ( + ) + { + } + [ + ] + ; + , + . + ... + @ + :: + +Operator: + = + > + < + ! + ~ + ? + : + -> + == + >= + <= + != + && + || + ++ + -- + + + - + * + / + & + | + ^ + % + << + >> + >>> + += + -= + *= + /= + &= + |= + ^= + %= + <<= + >>= + >>>= diff --git a/data/java-8/tokens.grammar b/data/java-8/tokens.grammar index 3521ac0..3941b94 100644 --- a/data/java-8/tokens.grammar +++ b/data/java-8/tokens.grammar @@ -43,7 +43,13 @@ Identifier: IdentifierChars: JavaLetter {JavaLetterOrDigit} +# Java 8 only has reserved keywords, but use modern names +# here to make a shared tokenizer simpler. 
Keyword: + ReservedKeyword + ContextualKeyword + +ReservedKeyword: abstract continue for @@ -95,14 +101,20 @@ Keyword: super while +ContextualKeyword: + Literal: IntegerLiteral FloatingPointLiteral BooleanLiteral CharacterLiteral StringLiteral + TextBlock NullLiteral +# Java 8 doesn't have TextBlock, but add it as newer grammars have it +TextBlock: + IntegerLiteral: DecimalIntegerLiteral HexIntegerLiteral diff --git a/meson.build b/meson.build index 7782285..a8f4b97 100644 --- a/meson.build +++ b/meson.build @@ -263,10 +263,12 @@ gen_tokens = executable( java_versions = [ 'java-8', + 'java-21', ] java_unicode_versions = { 'java-8': '6.2.0', + 'java-21': '15.0.0', } java_tokens_sources = [] diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc index ef0fce7..cc8c06d 100644 --- a/src/gen_tokens.cc +++ b/src/gen_tokens.cc @@ -88,7 +88,8 @@ class Generator { // Find the Elements that has at least one terminal or character class as symbol // These will be the different tokens the tokenizer can return void Generator::find_specific_elements(grammar::Element const& root) { - if (std::ranges::any_of(root.definitions, [](auto const& definition) { + if (root.definitions.empty() || + std::ranges::any_of(root.definitions, [](auto const& definition) { return definition.symbols.size() > 1 || definition.symbols[0].type == grammar::Symbol::Type::kTerminal; })) { @@ -233,7 +234,7 @@ void write_character_class_matchers(std::ostream& out, << "std::optional<size_t> TokenMatcher::matchLineTerminator" << "(std::string_view str) {\n" // Tokenizer normally reads one line at a time, there is only - // one construct (traditional comment) that needs it. + // a few constructs (traditional comment, textblock) that need it. // So match synthetic '\n' or report that it was needed if we are at // end of string. 
<< " if (str.empty()) {\n" @@ -361,6 +362,22 @@ void write_character_class_matchers(std::ostream& out, } std::ostream& quote(std::ostream& out, std::string_view in) { + int use_raw_string = 1; + for (auto c : in) { + if (c == '"' || c == '\\' || c == '\n') { + use_raw_string = 2; + } else if (c < ' ' || (c & 0x80)) { + use_raw_string = 0; + break; + } + } + if (use_raw_string == 2) { + out << "R\"("; + out << in; + out << ")\""; + return out; + } + out << '"'; bool avoid_digit = false; for (auto c : in) { @@ -569,6 +586,7 @@ bool Generator::write_matcher(std::ostream& out, bool have_internal = next_internal; next_internal = false; ReturnType symbol_return_type = return_type; + bool zero_or_more_with_terminal = false; if (symbol.optional != grammar::Symbol::Optional::kRequired && i + 1 < definition.symbols.size() && @@ -613,6 +631,33 @@ bool Generator::write_matcher(std::ostream& out, } out << indent << "while (true) {\n"; indent2 += " "; + + if (i + 1 < definition.symbols.size() && + definition.symbols[i + 1].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[i + 1].type == + grammar::Symbol::Type::kTerminal) { + if (symbol_return_type == return_type) { + out << indent2 << "ret = "; + } else { + out << indent2 << "ret_internal = "; + } + write_matcher(out, definition.symbols[i + 1], symbol_return_type, + "str.substr(tot)"); + out << ";\n"; + if (symbol_return_type != return_type) { + out << indent2 << "ret = ret_internal"; + match_return_type(out, symbol_return_type, "", return_type); + out << ";\n"; + } + out << indent2 << "if (ret.has_value()) {\n" + << indent2 << " tot += ret" << size_suffix << ";\n"; + if (last_internal) + out << indent2 << " last_internal = ret->first;\n"; + out << indent2 << " break;\n" << indent2 << "}\n"; + + zero_or_more_with_terminal = true; + } break; case grammar::Symbol::Optional::kExcluded: std::cerr << "Excluded mixed with conditional\n"; @@ -678,9 +723,15 @@ bool Generator::write_matcher(std::ostream& out, << 
indent2 << " last_internal = ret->first;\n"; break; case grammar::Symbol::Optional::kZeroOrMore: - out << indent2 << "if (!ret.has_value())\n" - << indent2 << " break;\n" - << indent2 << "tot += ret" << size_suffix << ";\n"; + out << indent2 << "if (!ret.has_value())\n"; + if (zero_or_more_with_terminal) { + out << indent2 << " return ret;\n"; + // Skip next symbol as it was already used to terminate the loop + ++i; + } else { + out << indent2 << " break;\n"; + } + out << indent2 << "tot += ret" << size_suffix << ";\n"; if (last_internal) out << indent2 << "last_internal = ret->first;\n"; out << indent << "}\n"; @@ -742,13 +793,12 @@ bool Generator::write_matcher(std::ostream& out, switch (return_type) { case ReturnType::kSize: out << "[[nodiscard]]\n" - << "std::optional<size_t> TokenMatcher::match" << element.name - << "(std::string_view str) {\n"; + << "std::optional<size_t> TokenMatcher::match" << element.name; break; case ReturnType::kTokenAndSize: out << "[[nodiscard]]\n" << "std::optional<std::pair<Token, size_t>> TokenMatcher::match" - << element.name << "(std::string_view str) {\n"; + << element.name; if (specific_tokens_.contains(element.name)) { sub_return_type = ReturnType::kSize; @@ -759,11 +809,19 @@ bool Generator::write_matcher(std::ostream& out, out << "[[nodiscard]]\n" << "std::optional<std::pair<TokenMatcher::Internal, size_t>> " "TokenMatcher::match" - << element.name << "(std::string_view str) {\n"; + << element.name; break; } - if (element.definitions.size() == 1) { + if (element.definitions.empty()) { + out << "(std::string_view /* str */) {\n"; + } else { + out << "(std::string_view str) {\n"; + } + + if (element.definitions.empty()) { + out << " return std::nullopt;\n"; + } else if (element.definitions.size() == 1) { if (make_token) { out << " auto ret = [this, str]() -> std::optional<size_t> {\n"; if (!write_matcher(out, element.definitions[0], sub_return_type, diff --git a/src/grammar.cc b/src/grammar.cc index 25c4d64..6ed2766 100644 --- 
a/src/grammar.cc +++ b/src/grammar.cc @@ -127,11 +127,6 @@ class GrammarLoader { auto it = second_pass_elements.begin(); for (auto const& pair : first_pass_elements) { auto const& element = *it++; - if (pair.second.definitions.empty()) { - errors_.err(pair.second.loc, - std::format("No definitions for {}", pair.first)); - continue; - } std::vector<std::string_view> in_symbols; for (auto const& in_definition : pair.second.definitions) { str::split(in_definition, in_symbols); @@ -247,10 +242,42 @@ class GrammarLoader { "No root element found"); } + optimize(second_pass_elements); + return std::make_unique<GrammarImpl>(std::move(second_pass_elements)); } private: + static void optimize(std::vector<std::unique_ptr<Element>> const& elements) { + merge_terminals(elements); + } + + static void merge_terminals(std::vector<std::unique_ptr<Element>> const& elements) { + for (auto const& element : elements) { + for (auto& definition : element->definitions) { + auto it = definition.symbols.begin(); + while (it != definition.symbols.end()) { + if (it->type != Symbol::Type::kTerminal) { + ++it; + continue; + } + + auto it2 = it + 1; + if (it2 == definition.symbols.end()) + break; + if (it2->type != Symbol::Type::kTerminal || + it->optional != it2->optional) { + ++it; + continue; + } + + it->value += it2->value; + definition.symbols.erase(it2); + } + } + } + } + std::unique_ptr<line::Reader> reader_; std::vector<std::string> const& character_classes_; src::Errors& errors_; diff --git a/src/java_tokens.cc b/src/java_tokens.cc index 1ba40a3..42c310b 100644 --- a/src/java_tokens.cc +++ b/src/java_tokens.cc @@ -1,6 +1,7 @@ #include "java_tokens.hh" #include "errors.hh" +#include "java_tokens_java-21.hh" #include "java_tokens_java-8.hh" #include "java_uescape.hh" #include "str.hh" @@ -62,9 +63,13 @@ class TokensImpl : public Tokens { break; line_tmp_.append(maybe_line.value()); got_any = true; - // Simple check, it might not actually end the comment but if so tokenizer will complain + 
// Simple check, it might not actually be true but if so tokenizer will complain // about reaching line_end again. - if (maybe_line->contains("*/")) + auto stop = (maybe_token_pair.has_value() && + maybe_token_pair->first == MatchToken::kStringLiteral) + ? R"(""")" + : "*/"; + if (maybe_line->contains(stop)) break; line_tmp_.push_back('\n'); } @@ -118,8 +123,11 @@ class TokensImpl : public Tokens { case MatchToken::kIdentifier: token.type = Token::Type::kIdentifier; break; - case MatchToken::kKeyword: - token.type = Token::Type::kKeyword; + case MatchToken::kReservedKeyword: + token.type = Token::Type::kReservedKeyword; + break; + case MatchToken::kContextualKeyword: + token.type = Token::Type::kContextualKeyword; break; case MatchToken::kNullLiteral: token.type = Token::Type::kLiteralNull; @@ -138,6 +146,13 @@ class TokensImpl : public Tokens { token.str = unescape_if_needed(token.str.substr(1, token.str.size() - 2)); break; + case MatchToken::kTextBlock: { + token.type = Token::Type::kLiteralString; + auto start = token.str.find('\n', 3) + 1; + token.str = unescape_if_needed(trim_indent( + token.str.substr(start, token.str.size() - 3 - start))); + break; + } case MatchToken::kTraditionalComment: { token.type = Token::Type::kComment; size_t s = 2; @@ -368,6 +383,39 @@ class TokensImpl : public Tokens { return count; } + static size_t indent(std::string_view str) { + size_t i = 0; + while (i < str.size() && + (str[i] == ' ' || str[i] == '\t' || str[i] == '\f')) + ++i; + return i; + } + + std::string_view trim_indent(std::string_view str) { + auto lines = str::split(str, '\n', /* keep_empty */ true); + auto it = lines.begin(); + auto min_indent = indent(*it); + if (min_indent == 0) + return str; + for (++it; it != lines.end(); ++it) { + auto i = indent(*it); + if (i < min_indent) { + if (i == 0) + return str; + min_indent = i; + } + } + trim_tmp_.clear(); + trim_tmp_.reserve(str.size()); + for (auto line : lines) { + trim_tmp_.append(line, min_indent); + 
trim_tmp_.push_back('\n'); + } + // remove last '\n' + trim_tmp_.resize(trim_tmp_.size() - 1); + return trim_tmp_; + } + std::unique_ptr<u8::line::Reader> reader_; std::unique_ptr<src::Errors> errors_; TokensConfig const config_; @@ -376,6 +424,7 @@ class TokensImpl : public Tokens { std::string line_tmp_; Location location_; std::string unescape_tmp_; + std::string trim_tmp_; }; } // namespace @@ -387,6 +436,10 @@ std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader, case Version::kJava8: return std::make_unique<TokensImpl<java_8::TokenMatcher, java_8::Token>>( std::move(reader), std::move(errors), config); + case Version::kJava21: + return std::make_unique< + TokensImpl<java_21::TokenMatcher, java_21::Token>>( + std::move(reader), std::move(errors), config); } std::unreachable(); } diff --git a/src/java_tokens.hh b/src/java_tokens.hh index 6fbefcb..c4e27c0 100644 --- a/src/java_tokens.hh +++ b/src/java_tokens.hh @@ -25,7 +25,10 @@ struct Token { kIdentifier, // str is keyword, int_value is Keyword index - kKeyword, + kReservedKeyword, + + // str is keyword, int_value is Keyword index + kContextualKeyword, // str is separator, int_value is Separator index kSeparator, diff --git a/src/java_version.hh b/src/java_version.hh index 444ae36..4877263 100644 --- a/src/java_version.hh +++ b/src/java_version.hh @@ -7,8 +7,9 @@ namespace java { enum class Version : uint8_t { kJava8 = 8, + kJava21 = 21, - kMax = kJava8, + kMax = kJava21, }; } // namespace java diff --git a/test/java_tokens.cc b/test/java_tokens.cc index 1c69196..cb1ae73 100644 --- a/test/java_tokens.cc +++ b/test/java_tokens.cc @@ -29,7 +29,7 @@ TEST_P(JavaTokens, empty_class) { java::TokensConfig{.version = GetParam()}); auto ret = tokens->read(); ASSERT_TRUE(ret.has_value()); - EXPECT_EQ(java::Token::Type::kKeyword, ret->type); + EXPECT_EQ(java::Token::Type::kReservedKeyword, ret->type); EXPECT_EQ("class", ret->str); EXPECT_EQ(1, ret->loc.line); EXPECT_EQ(0, ret->loc.column); @@ -602,5 +602,57 @@ 
TEST_P(JavaTokens, null) { EXPECT_EQ(io::ReadError::Eof, ret.error()); } +TEST_P(JavaTokens, textblock) { + auto input = io::memory(R"(String html = """ + <html> + <body> + <p>Hello, world</p> + </body> + </html> + """;)"); + auto tokens = java::open(std::move(input), make_errors(), + java::TokensConfig{.version = GetParam()}); + + auto ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kIdentifier, ret->type); + EXPECT_EQ("String", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kIdentifier, ret->type); + EXPECT_EQ("html", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kOperator, ret->type); + EXPECT_EQ("=", ret->str); + ret = tokens->read(); + if (std::to_underlying(GetParam()) >= 15) { + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralString, ret->type); + EXPECT_EQ(R"(<html> + <body> + <p>Hello, world</p> + </body> +</html> +)", + ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kSeparator, ret->type); + EXPECT_EQ(";", ret->str); + ret = tokens->read(); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::ReadError::Eof, ret.error()); + } else { + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kLiteralString, ret->type); + EXPECT_EQ("", ret->str); + ret = tokens->read(); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(java::Token::Type::kError, ret->type); + } +} + INSTANTIATE_TEST_SUITE_P(AllVersions, JavaTokens, - testing::Values(java::Version::kJava8)); + testing::Values(java::Version::kJava8, + java::Version::kJava21)); |
