summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/gen_tokens.cc78
-rw-r--r--src/grammar.cc37
-rw-r--r--src/java_tokens.cc61
-rw-r--r--src/java_tokens.hh5
-rw-r--r--src/java_version.hh3
5 files changed, 163 insertions, 21 deletions
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc
index ef0fce7..cc8c06d 100644
--- a/src/gen_tokens.cc
+++ b/src/gen_tokens.cc
@@ -88,7 +88,8 @@ class Generator {
// Find the Elements that has at least one terminal or character class as symbol
// These will be the different tokens the tokenizer can return
void Generator::find_specific_elements(grammar::Element const& root) {
- if (std::ranges::any_of(root.definitions, [](auto const& definition) {
+ if (root.definitions.empty() ||
+ std::ranges::any_of(root.definitions, [](auto const& definition) {
return definition.symbols.size() > 1 ||
definition.symbols[0].type == grammar::Symbol::Type::kTerminal;
})) {
@@ -233,7 +234,7 @@ void write_character_class_matchers(std::ostream& out,
<< "std::optional<size_t> TokenMatcher::matchLineTerminator"
<< "(std::string_view str) {\n"
// Tokenizer normally reads one line at a time, there is only
- // one construct (traditional comment) that needs it.
+ // a few constructs (traditional comment, text block) that need it.
// So match synthetic '\n' or report that it was needed if we are at
// end of string.
<< " if (str.empty()) {\n"
@@ -361,6 +362,22 @@ void write_character_class_matchers(std::ostream& out,
}
std::ostream& quote(std::ostream& out, std::string_view in) {
+ int use_raw_string = 1;
+ for (auto c : in) {
+ if (c == '"' || c == '\\' || c == '\n') {
+ use_raw_string = 2;
+ } else if (c < ' ' || (c & 0x80)) {
+ use_raw_string = 0;
+ break;
+ }
+ }
+ if (use_raw_string == 2) {
+ out << "R\"(";
+ out << in;
+ out << ")\"";
+ return out;
+ }
+
out << '"';
bool avoid_digit = false;
for (auto c : in) {
@@ -569,6 +586,7 @@ bool Generator::write_matcher(std::ostream& out,
bool have_internal = next_internal;
next_internal = false;
ReturnType symbol_return_type = return_type;
+ bool zero_or_more_with_terminal = false;
if (symbol.optional != grammar::Symbol::Optional::kRequired &&
i + 1 < definition.symbols.size() &&
@@ -613,6 +631,33 @@ bool Generator::write_matcher(std::ostream& out,
}
out << indent << "while (true) {\n";
indent2 += " ";
+
+ if (i + 1 < definition.symbols.size() &&
+ definition.symbols[i + 1].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[i + 1].type ==
+ grammar::Symbol::Type::kTerminal) {
+ if (symbol_return_type == return_type) {
+ out << indent2 << "ret = ";
+ } else {
+ out << indent2 << "ret_internal = ";
+ }
+ write_matcher(out, definition.symbols[i + 1], symbol_return_type,
+ "str.substr(tot)");
+ out << ";\n";
+ if (symbol_return_type != return_type) {
+ out << indent2 << "ret = ret_internal";
+ match_return_type(out, symbol_return_type, "", return_type);
+ out << ";\n";
+ }
+ out << indent2 << "if (ret.has_value()) {\n"
+ << indent2 << " tot += ret" << size_suffix << ";\n";
+ if (last_internal)
+ out << indent2 << " last_internal = ret->first;\n";
+ out << indent2 << " break;\n" << indent2 << "}\n";
+
+ zero_or_more_with_terminal = true;
+ }
break;
case grammar::Symbol::Optional::kExcluded:
std::cerr << "Excluded mixed with conditional\n";
@@ -678,9 +723,15 @@ bool Generator::write_matcher(std::ostream& out,
<< indent2 << " last_internal = ret->first;\n";
break;
case grammar::Symbol::Optional::kZeroOrMore:
- out << indent2 << "if (!ret.has_value())\n"
- << indent2 << " break;\n"
- << indent2 << "tot += ret" << size_suffix << ";\n";
+ out << indent2 << "if (!ret.has_value())\n";
+ if (zero_or_more_with_terminal) {
+ out << indent2 << " return ret;\n";
+ // Skip next symbol as it was already used to terminate the loop
+ ++i;
+ } else {
+ out << indent2 << " break;\n";
+ }
+ out << indent2 << "tot += ret" << size_suffix << ";\n";
if (last_internal)
out << indent2 << "last_internal = ret->first;\n";
out << indent << "}\n";
@@ -742,13 +793,12 @@ bool Generator::write_matcher(std::ostream& out,
switch (return_type) {
case ReturnType::kSize:
out << "[[nodiscard]]\n"
- << "std::optional<size_t> TokenMatcher::match" << element.name
- << "(std::string_view str) {\n";
+ << "std::optional<size_t> TokenMatcher::match" << element.name;
break;
case ReturnType::kTokenAndSize:
out << "[[nodiscard]]\n"
<< "std::optional<std::pair<Token, size_t>> TokenMatcher::match"
- << element.name << "(std::string_view str) {\n";
+ << element.name;
if (specific_tokens_.contains(element.name)) {
sub_return_type = ReturnType::kSize;
@@ -759,11 +809,19 @@ bool Generator::write_matcher(std::ostream& out,
out << "[[nodiscard]]\n"
<< "std::optional<std::pair<TokenMatcher::Internal, size_t>> "
"TokenMatcher::match"
- << element.name << "(std::string_view str) {\n";
+ << element.name;
break;
}
- if (element.definitions.size() == 1) {
+ if (element.definitions.empty()) {
+ out << "(std::string_view /* str */) {\n";
+ } else {
+ out << "(std::string_view str) {\n";
+ }
+
+ if (element.definitions.empty()) {
+ out << " return std::nullopt;\n";
+ } else if (element.definitions.size() == 1) {
if (make_token) {
out << " auto ret = [this, str]() -> std::optional<size_t> {\n";
if (!write_matcher(out, element.definitions[0], sub_return_type,
diff --git a/src/grammar.cc b/src/grammar.cc
index 25c4d64..6ed2766 100644
--- a/src/grammar.cc
+++ b/src/grammar.cc
@@ -127,11 +127,6 @@ class GrammarLoader {
auto it = second_pass_elements.begin();
for (auto const& pair : first_pass_elements) {
auto const& element = *it++;
- if (pair.second.definitions.empty()) {
- errors_.err(pair.second.loc,
- std::format("No definitions for {}", pair.first));
- continue;
- }
std::vector<std::string_view> in_symbols;
for (auto const& in_definition : pair.second.definitions) {
str::split(in_definition, in_symbols);
@@ -247,10 +242,42 @@ class GrammarLoader {
"No root element found");
}
+ optimize(second_pass_elements);
+
return std::make_unique<GrammarImpl>(std::move(second_pass_elements));
}
private:
+ static void optimize(std::vector<std::unique_ptr<Element>> const& elements) {
+ merge_terminals(elements);
+ }
+
+ static void merge_terminals(std::vector<std::unique_ptr<Element>> const& elements) {
+ for (auto const& element : elements) {
+ for (auto& definition : element->definitions) {
+ auto it = definition.symbols.begin();
+ while (it != definition.symbols.end()) {
+ if (it->type != Symbol::Type::kTerminal) {
+ ++it;
+ continue;
+ }
+
+ auto it2 = it + 1;
+ if (it2 == definition.symbols.end())
+ break;
+ if (it2->type != Symbol::Type::kTerminal ||
+ it->optional != it2->optional) {
+ ++it;
+ continue;
+ }
+
+ it->value += it2->value;
+ definition.symbols.erase(it2);
+ }
+ }
+ }
+ }
+
std::unique_ptr<line::Reader> reader_;
std::vector<std::string> const& character_classes_;
src::Errors& errors_;
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
index 1ba40a3..42c310b 100644
--- a/src/java_tokens.cc
+++ b/src/java_tokens.cc
@@ -1,6 +1,7 @@
#include "java_tokens.hh"
#include "errors.hh"
+#include "java_tokens_java-21.hh"
#include "java_tokens_java-8.hh"
#include "java_uescape.hh"
#include "str.hh"
@@ -62,9 +63,13 @@ class TokensImpl : public Tokens {
break;
line_tmp_.append(maybe_line.value());
got_any = true;
- // Simple check, it might not actually end the comment but if so tokenizer will complain
+ // Simple check, the match might not actually end the construct but if so tokenizer will complain
// about reaching line_end again.
- if (maybe_line->contains("*/"))
+ auto stop = (maybe_token_pair.has_value() &&
+ maybe_token_pair->first == MatchToken::kStringLiteral)
+ ? R"(""")"
+ : "*/";
+ if (maybe_line->contains(stop))
break;
line_tmp_.push_back('\n');
}
@@ -118,8 +123,11 @@ class TokensImpl : public Tokens {
case MatchToken::kIdentifier:
token.type = Token::Type::kIdentifier;
break;
- case MatchToken::kKeyword:
- token.type = Token::Type::kKeyword;
+ case MatchToken::kReservedKeyword:
+ token.type = Token::Type::kReservedKeyword;
+ break;
+ case MatchToken::kContextualKeyword:
+ token.type = Token::Type::kContextualKeyword;
break;
case MatchToken::kNullLiteral:
token.type = Token::Type::kLiteralNull;
@@ -138,6 +146,13 @@ class TokensImpl : public Tokens {
token.str =
unescape_if_needed(token.str.substr(1, token.str.size() - 2));
break;
+ case MatchToken::kTextBlock: {
+ token.type = Token::Type::kLiteralString;
+ auto start = token.str.find('\n', 3) + 1;
+ token.str = unescape_if_needed(trim_indent(
+ token.str.substr(start, token.str.size() - 3 - start)));
+ break;
+ }
case MatchToken::kTraditionalComment: {
token.type = Token::Type::kComment;
size_t s = 2;
@@ -368,6 +383,39 @@ class TokensImpl : public Tokens {
return count;
}
+ static size_t indent(std::string_view str) {
+ size_t i = 0;
+ while (i < str.size() &&
+ (str[i] == ' ' || str[i] == '\t' || str[i] == '\f'))
+ ++i;
+ return i;
+ }
+
+ std::string_view trim_indent(std::string_view str) {
+ auto lines = str::split(str, '\n', /* keep_empty */ true);
+ auto it = lines.begin();
+ auto min_indent = indent(*it);
+ if (min_indent == 0)
+ return str;
+ for (++it; it != lines.end(); ++it) {
+ auto i = indent(*it);
+ if (i < min_indent) {
+ if (i == 0)
+ return str;
+ min_indent = i;
+ }
+ }
+ trim_tmp_.clear();
+ trim_tmp_.reserve(str.size());
+ for (auto line : lines) {
+ trim_tmp_.append(line, min_indent);
+ trim_tmp_.push_back('\n');
+ }
+ // remove last '\n'
+ trim_tmp_.resize(trim_tmp_.size() - 1);
+ return trim_tmp_;
+ }
+
std::unique_ptr<u8::line::Reader> reader_;
std::unique_ptr<src::Errors> errors_;
TokensConfig const config_;
@@ -376,6 +424,7 @@ class TokensImpl : public Tokens {
std::string line_tmp_;
Location location_;
std::string unescape_tmp_;
+ std::string trim_tmp_;
};
} // namespace
@@ -387,6 +436,10 @@ std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
case Version::kJava8:
return std::make_unique<TokensImpl<java_8::TokenMatcher, java_8::Token>>(
std::move(reader), std::move(errors), config);
+ case Version::kJava21:
+ return std::make_unique<
+ TokensImpl<java_21::TokenMatcher, java_21::Token>>(
+ std::move(reader), std::move(errors), config);
}
std::unreachable();
}
diff --git a/src/java_tokens.hh b/src/java_tokens.hh
index 6fbefcb..c4e27c0 100644
--- a/src/java_tokens.hh
+++ b/src/java_tokens.hh
@@ -25,7 +25,10 @@ struct Token {
kIdentifier,
// str is keyword, int_value is Keyword index
- kKeyword,
+ kReservedKeyword,
+
+ // str is keyword, int_value is Keyword index
+ kContextualKeyword,
// str is separator, int_value is Separator index
kSeparator,
diff --git a/src/java_version.hh b/src/java_version.hh
index 444ae36..4877263 100644
--- a/src/java_version.hh
+++ b/src/java_version.hh
@@ -7,8 +7,9 @@ namespace java {
enum class Version : uint8_t {
kJava8 = 8,
+ kJava21 = 21,
- kMax = kJava8,
+ kMax = kJava21,
};
} // namespace java