summaryrefslogtreecommitdiff
path: root/src/grammar.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/grammar.cc')
-rw-r--r--src/grammar.cc268
1 files changed, 268 insertions, 0 deletions
diff --git a/src/grammar.cc b/src/grammar.cc
new file mode 100644
index 0000000..25c4d64
--- /dev/null
+++ b/src/grammar.cc
@@ -0,0 +1,268 @@
+#include "grammar.hh"
+
+#include "errors.hh"
+#include "line.hh"
+#include "location.hh"
+#include "str.hh"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <format>
+#include <functional>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace grammar {
+
+namespace {
+
+class GrammarImpl : public Grammar {
+ public:
+ explicit GrammarImpl(std::vector<std::unique_ptr<Element>> elements)
+ : elements_(std::move(elements)) {}
+
+ [[nodiscard]]
+ Element const& root() const override {
+ return *elements_[0];
+ }
+
+ private:
+ std::vector<std::unique_ptr<Element>> elements_;
+};
+
+struct FirstPassElement {
+ src::Location loc;
+ std::vector<std::string> definitions;
+
+ explicit FirstPassElement(src::Location loc) : loc(loc) {}
+};
+
+class GrammarLoader {
+ public:
+ GrammarLoader(std::unique_ptr<io::Reader> reader,
+ std::vector<std::string> const& character_classes,
+ src::Errors& errors)
+ : reader_(line::open(std::move(reader))),
+ character_classes_(character_classes),
+ errors_(errors) {}
+
+ std::unique_ptr<Grammar> load() {
+ // Read whole file in a first pass, before parsing definitions.
+ std::map<std::string, FirstPassElement> first_pass_elements;
+ FirstPassElement* last_element = nullptr;
+
+ std::map<std::string_view, uint8_t, std::less<>> cc_lookup;
+ for (uint8_t i = 0; i < static_cast<uint8_t>(character_classes_.size());
+ ++i) {
+ cc_lookup.emplace(character_classes_[i], i);
+ }
+
+ while (true) {
+ auto line = reader_->read();
+ auto loc = src::Location(reader_->number(), 0);
+ if (!line.has_value()) {
+ if (line.error() == io::ReadError::Eof)
+ break;
+ errors_.err(loc, "Error reading");
+ return nullptr;
+ }
+ if (line.value().empty() || line.value().front() == '#')
+ continue;
+ if (line.value().front() == ' ') {
+ // Continue on last element
+ size_t i = 1;
+ while (i < line.value().size() && line.value()[i] == ' ')
+ ++i;
+ if (i == line.value().size()) {
+ errors_.err(loc, "Unexpected line, only spaces");
+ continue;
+ }
+ if (last_element == nullptr) {
+ errors_.err(loc, "Expected element before indented lines");
+ continue;
+ }
+ last_element->definitions.emplace_back(line.value().substr(i));
+ } else {
+ // New element
+ if (line.value().back() != ':') {
+ errors_.err(
+ loc,
+ "Unexpected line, not indented but also not ending with a ':'");
+ continue;
+ }
+
+ auto name = line.value().substr(0, line.value().size() - 1);
+ if (cc_lookup.contains(name)) {
+ errors_.warn(
+ loc, std::format("Element {} overrides character class", name));
+ }
+ auto pair = first_pass_elements.emplace(name, FirstPassElement(loc));
+ if (!pair.second) {
+ errors_.err(loc, std::format("Duplicate element {}", name));
+ }
+ last_element = &pair.first->second;
+ }
+ }
+
+ if (first_pass_elements.empty()) {
+ errors_.err(src::Location(reader_->number(), 0), "No elements found");
+ return nullptr;
+ }
+
+ std::vector<std::unique_ptr<Element>> second_pass_elements;
+ std::map<std::string_view, size_t, std::less<>> second_pass_lookup;
+ for (auto const& pair : first_pass_elements) {
+ auto element = std::make_unique<Element>();
+ element->name = pair.first;
+ second_pass_lookup.emplace(element->name, second_pass_elements.size());
+ second_pass_elements.emplace_back(std::move(element));
+ }
+
+ auto it = second_pass_elements.begin();
+ for (auto const& pair : first_pass_elements) {
+ auto const& element = *it++;
+ if (pair.second.definitions.empty()) {
+ errors_.err(pair.second.loc,
+ std::format("No definitions for {}", pair.first));
+ continue;
+ }
+ std::vector<std::string_view> in_symbols;
+ for (auto const& in_definition : pair.second.definitions) {
+ str::split(in_definition, in_symbols);
+
+ std::vector<Symbol> out_symbols;
+ bool exclude = false;
+ bool expect_not = false;
+ for (auto in_symbol : in_symbols) {
+ Symbol out_symbol;
+ if (exclude) {
+ if (in_symbol == "or")
+ continue;
+ out_symbol.optional = Symbol::Optional::kExcluded;
+ } else if (expect_not) {
+ expect_not = false;
+ if (in_symbol == "not") {
+ exclude = true;
+ } else {
+ errors_.err(pair.second.loc, "but is not followed by not");
+ }
+ continue;
+ }
+ if (in_symbol == "but") {
+ expect_not = true;
+ continue;
+ }
+ if (in_symbol.front() == '{' && in_symbol.back() == '}') {
+ if (exclude) {
+ errors_.err(pair.second.loc,
+ "Optional and exclude doesn't work together");
+ } else {
+ out_symbol.optional = Symbol::Optional::kZeroOrMore;
+ }
+ in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+ } else if (in_symbol.front() == '[' && in_symbol.back() == ']') {
+ if (exclude) {
+ errors_.err(pair.second.loc,
+ "Optional and exclude doesn't work together");
+ } else {
+ out_symbol.optional = Symbol::Optional::kZeroOrOne;
+ }
+ in_symbol = in_symbol.substr(1, in_symbol.size() - 2);
+ }
+ auto it2 = second_pass_lookup.find(in_symbol);
+ if (it2 != second_pass_lookup.end()) {
+ out_symbol.type = Symbol::Type::kNonTerminal;
+ out_symbol.element = second_pass_elements[it2->second].get();
+ } else {
+ auto it3 = cc_lookup.find(in_symbol);
+ if (it3 != cc_lookup.end()) {
+ out_symbol.type = Symbol::Type::kCharacterClass;
+ out_symbol.char_class = it3->second;
+ } else {
+ out_symbol.type = Symbol::Type::kTerminal;
+ out_symbol.value = in_symbol;
+ }
+ }
+ out_symbols.emplace_back(std::move(out_symbol));
+ }
+
+ if (expect_not) {
+ errors_.err(pair.second.loc, "but is not followed by not");
+ }
+
+ if (out_symbols.empty()) {
+ errors_.err(pair.second.loc, "no symbols found in definition");
+ continue;
+ }
+
+ element->definitions.emplace_back(
+ Definition{.symbols = std::move(out_symbols)});
+ }
+ }
+
+ // Find root and move it first (if needed)
+ std::vector<size_t> used(second_pass_elements.size(), 0);
+ for (auto const& element : second_pass_elements) {
+ for (auto const& definition : element->definitions) {
+ for (auto const& symbol : definition.symbols) {
+ switch (symbol.type) {
+ case Symbol::Type::kTerminal:
+ case Symbol::Type::kCharacterClass:
+ break;
+ case Symbol::Type::kNonTerminal:
+ used[second_pass_lookup.find(symbol.element->name)->second]++;
+ break;
+ }
+ }
+ }
+ }
+
+ std::optional<size_t> root_index;
+ for (size_t i = 0; i < used.size(); ++i) {
+ if (used[i] == 0) {
+ if (root_index.has_value()) {
+ errors_.warn(first_pass_elements.find(second_pass_elements[i]->name)
+ ->second.loc,
+ "Is not referenced but also not root");
+ } else {
+ root_index = i;
+ }
+ }
+ }
+
+ if (root_index.has_value()) {
+ if (root_index.value() != 0) {
+ std::swap(second_pass_elements[0],
+ second_pass_elements[root_index.value()]);
+ }
+ } else {
+ errors_.err(
+ first_pass_elements.find(second_pass_elements[0]->name)->second.loc,
+ "No root element found");
+ }
+
+ return std::make_unique<GrammarImpl>(std::move(second_pass_elements));
+ }
+
+ private:
+ std::unique_ptr<line::Reader> reader_;
+ std::vector<std::string> const& character_classes_;
+ src::Errors& errors_;
+};
+
+} // namespace
+
+std::unique_ptr<Grammar> load(std::unique_ptr<io::Reader> reader,
+ std::vector<std::string> const& character_classes,
+ src::Errors& errors) {
+ GrammarLoader loader(std::move(reader), character_classes, errors);
+ return loader.load();
+}
+
+} // namespace grammar