diff options
Diffstat (limited to 'src/gen_tokens.cc')
| -rw-r--r-- | src/gen_tokens.cc | 78 |
1 files changed, 68 insertions, 10 deletions
diff --git a/src/gen_tokens.cc b/src/gen_tokens.cc index ef0fce7..cc8c06d 100644 --- a/src/gen_tokens.cc +++ b/src/gen_tokens.cc @@ -88,7 +88,8 @@ class Generator { // Find the Elements that has at least one terminal or character class as symbol // These will be the different tokens the tokenizer can return void Generator::find_specific_elements(grammar::Element const& root) { - if (std::ranges::any_of(root.definitions, [](auto const& definition) { + if (root.definitions.empty() || + std::ranges::any_of(root.definitions, [](auto const& definition) { return definition.symbols.size() > 1 || definition.symbols[0].type == grammar::Symbol::Type::kTerminal; })) { @@ -233,7 +234,7 @@ void write_character_class_matchers(std::ostream& out, << "std::optional<size_t> TokenMatcher::matchLineTerminator" << "(std::string_view str) {\n" // Tokenizer normally reads one line at a time, there is only - // one construct (traditional comment) that needs it. + // a few constructs (traditional comment, textblock) that needs it. // So match synthetic '\n' or report that it was needed if we are at // end of string. << " if (str.empty()) {\n" @@ -361,6 +362,22 @@ void write_character_class_matchers(std::ostream& out, } std::ostream& quote(std::ostream& out, std::string_view in) { + int use_raw_string = 1; + for (auto c : in) { + if (c == '"' || c == '\\' || c == '\n') { + use_raw_string = 2; + } else if (c < ' ' || (c & 0x80)) { + use_raw_string = 0; + break; + } + } + if (use_raw_string == 2) { + out << "R\"("; + out << in; + out << ")\""; + return out; + } + out << '"'; bool avoid_digit = false; for (auto c : in) { @@ -569,6 +586,7 @@ bool Generator::write_matcher(std::ostream& out, bool have_internal = next_internal; next_internal = false; ReturnType symbol_return_type = return_type; + bool zero_or_more_with_terminal = false; if (symbol.optional != grammar::Symbol::Optional::kRequired && i + 1 < definition.symbols.size() && @@ -613,6 +631,33 @@ bool Generator::write_matcher(std::ostream& out, } out << indent << "while (true) {\n"; indent2 += " "; + + if (i + 1 < definition.symbols.size() && + definition.symbols[i + 1].optional == + grammar::Symbol::Optional::kRequired && + definition.symbols[i + 1].type == + grammar::Symbol::Type::kTerminal) { + if (symbol_return_type == return_type) { + out << indent2 << "ret = "; + } else { + out << indent2 << "ret_internal = "; + } + write_matcher(out, definition.symbols[i + 1], symbol_return_type, + "str.substr(tot)"); + out << ";\n"; + if (symbol_return_type != return_type) { + out << indent2 << "ret = ret_internal"; + match_return_type(out, symbol_return_type, "", return_type); + out << ";\n"; + } + out << indent2 << "if (ret.has_value()) {\n" + << indent2 << " tot += ret" << size_suffix << ";\n"; + if (last_internal) + out << indent2 << " last_internal = ret->first;\n"; + out << indent2 << " break;\n" << indent2 << "}\n"; + + zero_or_more_with_terminal = true; + } break; case grammar::Symbol::Optional::kExcluded: std::cerr << "Excluded mixed with conditional\n"; @@ -678,9 +723,15 @@ bool Generator::write_matcher(std::ostream& out, << indent2 << " last_internal = ret->first;\n"; break; case grammar::Symbol::Optional::kZeroOrMore: - out << indent2 << "if (!ret.has_value())\n" - << indent2 << " break;\n" - << indent2 << "tot += ret" << size_suffix << ";\n"; + out << indent2 << "if (!ret.has_value())\n"; + if (zero_or_more_with_terminal) { + out << indent2 << " return ret;\n"; + // Skip next symbol as it was already used to terminate the loop + ++i; + } else { + out << indent2 << " break;\n"; + } + out << indent2 << "tot += ret" << size_suffix << ";\n"; if (last_internal) out << indent2 << "last_internal = ret->first;\n"; out << indent << "}\n"; @@ -742,13 +793,12 @@ bool Generator::write_matcher(std::ostream& out, switch (return_type) { case ReturnType::kSize: out << "[[nodiscard]]\n" - << "std::optional<size_t> TokenMatcher::match" << element.name - << "(std::string_view str) {\n"; + << "std::optional<size_t> TokenMatcher::match" << element.name; break; case ReturnType::kTokenAndSize: out << "[[nodiscard]]\n" << "std::optional<std::pair<Token, size_t>> TokenMatcher::match" - << element.name << "(std::string_view str) {\n"; + << element.name; if (specific_tokens_.contains(element.name)) { sub_return_type = ReturnType::kSize; @@ -759,11 +809,19 @@ bool Generator::write_matcher(std::ostream& out, out << "[[nodiscard]]\n" << "std::optional<std::pair<TokenMatcher::Internal, size_t>> " "TokenMatcher::match" - << element.name << "(std::string_view str) {\n"; + << element.name; break; } - if (element.definitions.size() == 1) { + if (element.definitions.empty()) { + out << "(std::string_view /* str */) {\n"; + } else { + out << "(std::string_view str) {\n"; + } + + if (element.definitions.empty()) { + out << " return std::nullopt;\n"; + } else if (element.definitions.size() == 1) { if (make_token) { out << " auto ret = [this, str]() -> std::optional<size_t> {\n"; if (!write_matcher(out, element.definitions[0], sub_return_type, |
