#include "grammar.hh" #include "errors.hh" #include "line.hh" #include "location.hh" #include "str.hh" #include #include #include #include #include #include #include #include #include #include #include #include namespace grammar { namespace { class GrammarImpl : public Grammar { public: explicit GrammarImpl(std::vector> elements) : elements_(std::move(elements)) {} [[nodiscard]] Element const& root() const override { return *elements_[0]; } private: std::vector> elements_; }; struct FirstPassElement { src::Location loc; std::vector definitions; explicit FirstPassElement(src::Location loc) : loc(loc) {} }; class GrammarLoader { public: GrammarLoader(std::unique_ptr reader, std::vector const& character_classes, src::Errors& errors) : reader_(line::open(std::move(reader))), character_classes_(character_classes), errors_(errors) {} std::unique_ptr load() { // Read whole file in a first pass, before parsing definitions. std::map first_pass_elements; FirstPassElement* last_element = nullptr; std::map> cc_lookup; for (uint8_t i = 0; i < static_cast(character_classes_.size()); ++i) { cc_lookup.emplace(character_classes_[i], i); } while (true) { auto line = reader_->read(); auto loc = src::Location(reader_->number(), 0); if (!line.has_value()) { if (line.error() == io::ReadError::Eof) break; errors_.err(loc, "Error reading"); return nullptr; } if (line.value().empty() || line.value().front() == '#') continue; if (line.value().front() == ' ') { // Continue on last element size_t i = 1; while (i < line.value().size() && line.value()[i] == ' ') ++i; if (i == line.value().size()) { errors_.err(loc, "Unexpected line, only spaces"); continue; } if (last_element == nullptr) { errors_.err(loc, "Expected element before indented lines"); continue; } last_element->definitions.emplace_back(line.value().substr(i)); } else { // New element if (line.value().back() != ':') { errors_.err( loc, "Unexpected line, not indented but also not ending with a ':'"); continue; } auto name = line.value().substr(0, line.value().size() - 1); if (cc_lookup.contains(name)) { errors_.warn( loc, std::format("Element {} overrides character class", name)); } auto pair = first_pass_elements.emplace(name, FirstPassElement(loc)); if (!pair.second) { errors_.err(loc, std::format("Duplicate element {}", name)); } last_element = &pair.first->second; } } if (first_pass_elements.empty()) { errors_.err(src::Location(reader_->number(), 0), "No elements found"); return nullptr; } std::vector> second_pass_elements; std::map> second_pass_lookup; for (auto const& pair : first_pass_elements) { auto element = std::make_unique(); element->name = pair.first; second_pass_lookup.emplace(element->name, second_pass_elements.size()); second_pass_elements.emplace_back(std::move(element)); } size_t i = 0; for (auto const& pair : first_pass_elements) { for (auto const& in_definition : pair.second.definitions) { auto out_symbols = parse_definition(pair.second.loc, second_pass_elements, second_pass_lookup, cc_lookup, in_definition); if (out_symbols.empty()) { errors_.err(pair.second.loc, "no symbols found in definition"); continue; } second_pass_elements[i]->definitions.emplace_back( Definition{.symbols = std::move(out_symbols)}); } ++i; } // Find root and move it first (if needed) std::vector used(second_pass_elements.size(), 0); for (auto const& element : second_pass_elements) { for (auto const& definition : element->definitions) { for (auto const& symbol : definition.symbols) { switch (symbol.type) { case Symbol::Type::kTerminal: case Symbol::Type::kCharacterClass: break; case Symbol::Type::kNonTerminal: if (!symbol.element->name.empty()) used[second_pass_lookup.find(symbol.element->name)->second]++; break; } } } } std::optional root_index; for (size_t i = 0; i < used.size(); ++i) { if (used[i] == 0 && !second_pass_elements[i]->name.empty()) { if (root_index.has_value()) { errors_.warn(first_pass_elements.find(second_pass_elements[i]->name) ->second.loc, std::format("{} is not referenced but also not root", second_pass_elements[i]->name)); } else { root_index = i; } } } if (root_index.has_value()) { if (root_index.value() != 0) { std::swap(second_pass_elements[0], second_pass_elements[root_index.value()]); } } else { errors_.err( first_pass_elements.find(second_pass_elements[0]->name)->second.loc, "No root element found"); } optimize(second_pass_elements); return std::make_unique(std::move(second_pass_elements)); } private: static void optimize(std::vector> const& elements) { merge_terminals(elements); } static void merge_terminals( std::vector> const& elements) { for (auto const& element : elements) { for (auto& definition : element->definitions) { auto it = definition.symbols.begin(); while (it != definition.symbols.end()) { if (it->type != Symbol::Type::kTerminal) { ++it; continue; } auto it2 = it + 1; if (it2 == definition.symbols.end()) break; if (it2->type != Symbol::Type::kTerminal || it->optional != it2->optional) { ++it; continue; } it->value += it2->value; definition.symbols.erase(it2); } } } } std::vector parse_definition( src::Location const& loc, std::vector>& non_terminal, std::map>& non_terminal_lookup, std::map> const& cc_lookup, std::string_view in_definition) { std::vector out_symbols; bool exclude = false; bool expect_not = false; while (!in_definition.empty()) { auto symbol = parse_symbol(loc, non_terminal, non_terminal_lookup, cc_lookup, in_definition, exclude, expect_not); if (symbol.has_value()) { out_symbols.push_back(std::move(symbol.value())); } } return out_symbols; } std::optional parse_symbol( src::Location const& loc, std::vector>& non_terminal, std::map>& non_terminal_lookup, std::map> const& cc_lookup, std::string_view& input, bool& exclude, bool& expect_not, char extra_terminator = '\0') { Symbol out_symbol; if ((input[0] == '{' || input[0] == '[') && input.size() > 1 && input[1] != ' ') { char end = input[0] == '{' ? '}' : ']'; std::vector inner_symbols; bool inner_exclude = false; bool inner_expect_not = false; input = input.substr(1); while (!input.empty() && input.front() != end) { auto symbol = parse_symbol(loc, non_terminal, non_terminal_lookup, cc_lookup, input, inner_exclude, inner_expect_not, end); if (symbol.has_value()) { inner_symbols.push_back(std::move(symbol.value())); } } if (input.empty()) { errors_.err(loc, "unclosed sub-symbol"); return std::nullopt; } if (input.size() == 1 || input[1] != ' ') { input = input.substr(1); } else { input = input.substr(2); } if (inner_symbols.empty()) { errors_.err(loc, "empty sub-symbol"); return std::nullopt; } if (inner_symbols.size() == 1) { out_symbol = inner_symbols[0]; } else { auto anon_element = std::make_unique(); anon_element->definitions.emplace_back( Definition{.symbols = std::move(inner_symbols)}); non_terminal.push_back(std::move(anon_element)); out_symbol.type = Symbol::Type::kNonTerminal; out_symbol.element = non_terminal.back().get(); } if (exclude) { errors_.err(loc, "Optional and exclude doesn't work together"); } else if (end == '}') { out_symbol.optional = Symbol::Optional::kZeroOrMore; } else /* if (end == '[') */ { out_symbol.optional = Symbol::Optional::kZeroOrOne; } return out_symbol; } std::string_view::size_type end; if (extra_terminator) { char tmp[2]; tmp[0] = extra_terminator; tmp[1] = ' '; end = input.find_first_of(std::string_view{tmp, 2}); } else { end = input.find(' '); } std::string_view in_symbol; if (end == std::string_view::npos) { in_symbol = input; input = std::string_view{}; } else { in_symbol = input.substr(0, end); if (input[end] == ' ') { input = input.substr(end + 1); } else { input = input.substr(end); } } if (exclude) { if (in_symbol == "or") return std::nullopt; out_symbol.optional = Symbol::Optional::kExcluded; } else if (expect_not) { expect_not = false; if (in_symbol == "not") { exclude = true; } else { errors_.err(loc, "but is not followed by not"); } return std::nullopt; } if (in_symbol == "but") { expect_not = true; return std::nullopt; } auto it2 = non_terminal_lookup.find(in_symbol); if (it2 != non_terminal_lookup.end()) { out_symbol.type = Symbol::Type::kNonTerminal; out_symbol.element = non_terminal[it2->second].get(); } else { auto it3 = cc_lookup.find(in_symbol); if (it3 != cc_lookup.end()) { out_symbol.type = Symbol::Type::kCharacterClass; out_symbol.char_class = it3->second; } else { out_symbol.type = Symbol::Type::kTerminal; out_symbol.value = in_symbol; } } return out_symbol; } std::unique_ptr reader_; std::vector const& character_classes_; src::Errors& errors_; }; } // namespace std::unique_ptr load(std::unique_ptr reader, std::vector const& character_classes, src::Errors& errors) { GrammarLoader loader(std::move(reader), character_classes, errors); return loader.load(); } } // namespace grammar