summaryrefslogtreecommitdiff
path: root/src/gen_tokens.cc
diff options
context:
space:
mode:
authorJoel Klinghed <the_jk@spawned.biz>2025-09-29 09:39:49 +0200
committerJoel Klinghed <the_jk@spawned.biz>2025-09-29 09:50:47 +0200
commitd196d51e07f50f3510c43ad375c5559b58860023 (patch)
tree3432b8e99e306d0ece9f29ddad1e2945f88a1481 /src/gen_tokens.cc
parent1e9e51dae1c01bab7562911b958c47528b8011c8 (diff)
java: Add tokens support for Java 21
There are some new keywords; I opted to modify the java-8 grammar to use the new names, even if they are not going to match anything. This makes the tokenizer easier to write.
Diffstat (limited to 'src/gen_tokens.cc')
-rw-r--r--src/gen_tokens.cc78
1 files changed, 68 insertions, 10 deletions
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc
index ef0fce7..cc8c06d 100644
--- a/src/gen_tokens.cc
+++ b/src/gen_tokens.cc
@@ -88,7 +88,8 @@ class Generator {
// Find the Elements that have at least one terminal or character class as a symbol
// These will be the different tokens the tokenizer can return
void Generator::find_specific_elements(grammar::Element const& root) {
- if (std::ranges::any_of(root.definitions, [](auto const& definition) {
+ if (root.definitions.empty() ||
+ std::ranges::any_of(root.definitions, [](auto const& definition) {
return definition.symbols.size() > 1 ||
definition.symbols[0].type == grammar::Symbol::Type::kTerminal;
})) {
@@ -233,7 +234,7 @@ void write_character_class_matchers(std::ostream& out,
<< "std::optional<size_t> TokenMatcher::matchLineTerminator"
<< "(std::string_view str) {\n"
// Tokenizer normally reads one line at a time, there is only
- // one construct (traditional comment) that needs it.
+ // a few constructs (traditional comment, textblock) that need it.
// So match synthetic '\n' or report that it was needed if we are at
// end of string.
<< " if (str.empty()) {\n"
@@ -361,6 +362,22 @@ void write_character_class_matchers(std::ostream& out,
}
std::ostream& quote(std::ostream& out, std::string_view in) {
+ int use_raw_string = 1;
+ for (auto c : in) {
+ if (c == '"' || c == '\\' || c == '\n') {
+ use_raw_string = 2;
+ } else if (c < ' ' || (c & 0x80)) {
+ use_raw_string = 0;
+ break;
+ }
+ }
+ if (use_raw_string == 2) {
+ out << "R\"(";
+ out << in;
+ out << ")\"";
+ return out;
+ }
+
out << '"';
bool avoid_digit = false;
for (auto c : in) {
@@ -569,6 +586,7 @@ bool Generator::write_matcher(std::ostream& out,
bool have_internal = next_internal;
next_internal = false;
ReturnType symbol_return_type = return_type;
+ bool zero_or_more_with_terminal = false;
if (symbol.optional != grammar::Symbol::Optional::kRequired &&
i + 1 < definition.symbols.size() &&
@@ -613,6 +631,33 @@ bool Generator::write_matcher(std::ostream& out,
}
out << indent << "while (true) {\n";
indent2 += " ";
+
+ if (i + 1 < definition.symbols.size() &&
+ definition.symbols[i + 1].optional ==
+ grammar::Symbol::Optional::kRequired &&
+ definition.symbols[i + 1].type ==
+ grammar::Symbol::Type::kTerminal) {
+ if (symbol_return_type == return_type) {
+ out << indent2 << "ret = ";
+ } else {
+ out << indent2 << "ret_internal = ";
+ }
+ write_matcher(out, definition.symbols[i + 1], symbol_return_type,
+ "str.substr(tot)");
+ out << ";\n";
+ if (symbol_return_type != return_type) {
+ out << indent2 << "ret = ret_internal";
+ match_return_type(out, symbol_return_type, "", return_type);
+ out << ";\n";
+ }
+ out << indent2 << "if (ret.has_value()) {\n"
+ << indent2 << " tot += ret" << size_suffix << ";\n";
+ if (last_internal)
+ out << indent2 << " last_internal = ret->first;\n";
+ out << indent2 << " break;\n" << indent2 << "}\n";
+
+ zero_or_more_with_terminal = true;
+ }
break;
case grammar::Symbol::Optional::kExcluded:
std::cerr << "Excluded mixed with conditional\n";
@@ -678,9 +723,15 @@ bool Generator::write_matcher(std::ostream& out,
<< indent2 << " last_internal = ret->first;\n";
break;
case grammar::Symbol::Optional::kZeroOrMore:
- out << indent2 << "if (!ret.has_value())\n"
- << indent2 << " break;\n"
- << indent2 << "tot += ret" << size_suffix << ";\n";
+ out << indent2 << "if (!ret.has_value())\n";
+ if (zero_or_more_with_terminal) {
+ out << indent2 << " return ret;\n";
+ // Skip next symbol as it was already used to terminate the loop
+ ++i;
+ } else {
+ out << indent2 << " break;\n";
+ }
+ out << indent2 << "tot += ret" << size_suffix << ";\n";
if (last_internal)
out << indent2 << "last_internal = ret->first;\n";
out << indent << "}\n";
@@ -742,13 +793,12 @@ bool Generator::write_matcher(std::ostream& out,
switch (return_type) {
case ReturnType::kSize:
out << "[[nodiscard]]\n"
- << "std::optional<size_t> TokenMatcher::match" << element.name
- << "(std::string_view str) {\n";
+ << "std::optional<size_t> TokenMatcher::match" << element.name;
break;
case ReturnType::kTokenAndSize:
out << "[[nodiscard]]\n"
<< "std::optional<std::pair<Token, size_t>> TokenMatcher::match"
- << element.name << "(std::string_view str) {\n";
+ << element.name;
if (specific_tokens_.contains(element.name)) {
sub_return_type = ReturnType::kSize;
@@ -759,11 +809,19 @@ bool Generator::write_matcher(std::ostream& out,
out << "[[nodiscard]]\n"
<< "std::optional<std::pair<TokenMatcher::Internal, size_t>> "
"TokenMatcher::match"
- << element.name << "(std::string_view str) {\n";
+ << element.name;
break;
}
- if (element.definitions.size() == 1) {
+ if (element.definitions.empty()) {
+ out << "(std::string_view /* str */) {\n";
+ } else {
+ out << "(std::string_view str) {\n";
+ }
+
+ if (element.definitions.empty()) {
+ out << " return std::nullopt;\n";
+ } else if (element.definitions.size() == 1) {
if (make_token) {
out << " auto ret = [this, str]() -> std::optional<size_t> {\n";
if (!write_matcher(out, element.definitions[0], sub_return_type,