diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2025-09-27 20:11:32 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2025-09-28 22:48:24 +0200 |
| commit | c1ae5d53fb0fa7ceb9d6fc7a60c87df958ce37fe (patch) | |
| tree | f028a04619aa1b69f8b0aa72a5154f6ba1c09775 /src/grammar.cc | |
| parent | 2f13baa843bd1fb5db6630a2823681ffaff9fb11 (diff) | |
WIPWIP
Diffstat (limited to 'src/grammar.cc')
| -rw-r--r-- | src/grammar.cc | 268 |
1 files changed, 268 insertions, 0 deletions
diff --git a/src/grammar.cc b/src/grammar.cc new file mode 100644 index 0000000..25c4d64 --- /dev/null +++ b/src/grammar.cc @@ -0,0 +1,268 @@ +#include "grammar.hh" + +#include "errors.hh" +#include "line.hh" +#include "location.hh" +#include "str.hh" + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <format> +#include <functional> +#include <map> +#include <memory> +#include <optional> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +namespace grammar { + +namespace { + +class GrammarImpl : public Grammar { + public: + explicit GrammarImpl(std::vector<std::unique_ptr<Element>> elements) + : elements_(std::move(elements)) {} + + [[nodiscard]] + Element const& root() const override { + return *elements_[0]; + } + + private: + std::vector<std::unique_ptr<Element>> elements_; +}; + +struct FirstPassElement { + src::Location loc; + std::vector<std::string> definitions; + + explicit FirstPassElement(src::Location loc) : loc(loc) {} +}; + +class GrammarLoader { + public: + GrammarLoader(std::unique_ptr<io::Reader> reader, + std::vector<std::string> const& character_classes, + src::Errors& errors) + : reader_(line::open(std::move(reader))), + character_classes_(character_classes), + errors_(errors) {} + + std::unique_ptr<Grammar> load() { + // Read whole file in a first pass, before parsing definitions. + std::map<std::string, FirstPassElement> first_pass_elements; + FirstPassElement* last_element = nullptr; + + std::map<std::string_view, uint8_t, std::less<>> cc_lookup; + for (uint8_t i = 0; i < static_cast<uint8_t>(character_classes_.size()); + ++i) { + cc_lookup.emplace(character_classes_[i], i); + } + + while (true) { + auto line = reader_->read(); + auto loc = src::Location(reader_->number(), 0); + if (!line.has_value()) { + if (line.error() == io::ReadError::Eof) + break; + errors_.err(loc, "Error reading"); + return nullptr; + } + if (line.value().empty() || line.value().front() == '#') + continue; + if (line.value().front() == ' ') { + // Continue on last element + size_t i = 1; + while (i < line.value().size() && line.value()[i] == ' ') + ++i; + if (i == line.value().size()) { + errors_.err(loc, "Unexpected line, only spaces"); + continue; + } + if (last_element == nullptr) { + errors_.err(loc, "Expected element before indented lines"); + continue; + } + last_element->definitions.emplace_back(line.value().substr(i)); + } else { + // New element + if (line.value().back() != ':') { + errors_.err( + loc, + "Unexpected line, not indented but also not ending with a ':'"); + continue; + } + + auto name = line.value().substr(0, line.value().size() - 1); + if (cc_lookup.contains(name)) { + errors_.warn( + loc, std::format("Element {} overrides character class", name)); + } + auto pair = first_pass_elements.emplace(name, FirstPassElement(loc)); + if (!pair.second) { + errors_.err(loc, std::format("Duplicate element {}", name)); + } + last_element = &pair.first->second; + } + } + + if (first_pass_elements.empty()) { + errors_.err(src::Location(reader_->number(), 0), "No elements found"); + return nullptr; + } + + std::vector<std::unique_ptr<Element>> second_pass_elements; + std::map<std::string_view, size_t, std::less<>> second_pass_lookup; + for (auto const& pair : first_pass_elements) { + auto element = std::make_unique<Element>(); + element->name = pair.first; + second_pass_lookup.emplace(element->name, second_pass_elements.size()); + second_pass_elements.emplace_back(std::move(element)); + } + + auto it = second_pass_elements.begin(); + for (auto const& pair : first_pass_elements) { + auto const& element = *it++; + if (pair.second.definitions.empty()) { + errors_.err(pair.second.loc, + std::format("No definitions for {}", pair.first)); + continue; + } + std::vector<std::string_view> in_symbols; + for (auto const& in_definition : pair.second.definitions) { + str::split(in_definition, in_symbols); + + std::vector<Symbol> out_symbols; + bool exclude = false; + bool expect_not = false; + for (auto in_symbol : in_symbols) { + Symbol out_symbol; + if (exclude) { + if (in_symbol == "or") + continue; + out_symbol.optional = Symbol::Optional::kExcluded; + } else if (expect_not) { + expect_not = false; + if (in_symbol == "not") { + exclude = true; + } else { + errors_.err(pair.second.loc, "but is not followed by not"); + } + continue; + } + if (in_symbol == "but") { + expect_not = true; + continue; + } + if (in_symbol.front() == '{' && in_symbol.back() == '}') { + if (exclude) { + errors_.err(pair.second.loc, + "Optional and exclude doesn't work together"); + } else { + out_symbol.optional = Symbol::Optional::kZeroOrMore; + } + in_symbol = in_symbol.substr(1, in_symbol.size() - 2); + } else if (in_symbol.front() == '[' && in_symbol.back() == ']') { + if (exclude) { + errors_.err(pair.second.loc, + "Optional and exclude doesn't work together"); + } else { + out_symbol.optional = Symbol::Optional::kZeroOrOne; + } + in_symbol = in_symbol.substr(1, in_symbol.size() - 2); + } + auto it2 = second_pass_lookup.find(in_symbol); + if (it2 != second_pass_lookup.end()) { + out_symbol.type = Symbol::Type::kNonTerminal; + out_symbol.element = second_pass_elements[it2->second].get(); + } else { + auto it3 = cc_lookup.find(in_symbol); + if (it3 != cc_lookup.end()) { + out_symbol.type = Symbol::Type::kCharacterClass; + out_symbol.char_class = it3->second; + } else { + out_symbol.type = Symbol::Type::kTerminal; + out_symbol.value = in_symbol; + } + } + out_symbols.emplace_back(std::move(out_symbol)); + } + + if (expect_not) { + errors_.err(pair.second.loc, "but is not followed by not"); + } + + if (out_symbols.empty()) { + errors_.err(pair.second.loc, "no symbols found in definition"); + continue; + } + + element->definitions.emplace_back( + Definition{.symbols = std::move(out_symbols)}); + } + } + + // Find root and move it first (if needed) + std::vector<size_t> used(second_pass_elements.size(), 0); + for (auto const& element : second_pass_elements) { + for (auto const& definition : element->definitions) { + for (auto const& symbol : definition.symbols) { + switch (symbol.type) { + case Symbol::Type::kTerminal: + case Symbol::Type::kCharacterClass: + break; + case Symbol::Type::kNonTerminal: + used[second_pass_lookup.find(symbol.element->name)->second]++; + break; + } + } + } + } + + std::optional<size_t> root_index; + for (size_t i = 0; i < used.size(); ++i) { + if (used[i] == 0) { + if (root_index.has_value()) { + errors_.warn(first_pass_elements.find(second_pass_elements[i]->name) + ->second.loc, + "Is not referenced but also not root"); + } else { + root_index = i; + } + } + } + + if (root_index.has_value()) { + if (root_index.value() != 0) { + std::swap(second_pass_elements[0], + second_pass_elements[root_index.value()]); + } + } else { + errors_.err( + first_pass_elements.find(second_pass_elements[0]->name)->second.loc, + "No root element found"); + } + + return std::make_unique<GrammarImpl>(std::move(second_pass_elements)); + } + + private: + std::unique_ptr<line::Reader> reader_; + std::vector<std::string> const& character_classes_; + src::Errors& errors_; +}; + +} // namespace + +std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader, + std::vector<std::string> const& character_classes, + src::Errors& errors) { + GrammarLoader loader(std::move(reader), character_classes, errors); + return loader.load(); +} + +} // namespace grammar |
