diff options
Diffstat (limited to 'sax/src/sax_processor.cc')
| -rw-r--r-- | sax/src/sax_processor.cc | 1185 |
1 files changed, 1176 insertions, 9 deletions
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
index ea9f753..afc9d3b 100644
--- a/sax/src/sax_processor.cc
+++ b/sax/src/sax_processor.cc
@@ -1,18 +1,41 @@
 #include "sax_processor.hh"
 
-#include "sax_decoder.hh"
+#include <iostream>
+
+#include "buffer.hh"
+#include "guessing_decoder.hh"
 #include "processor.hh"
+#include "sax_attributes.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+#include "sax_delegate.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
 #include "utils.hh"
 
 #include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <format>
+#include <map>
 #include <optional>
 #include <utility>
+#include <vector>
+
+using namespace std::string_view_literals;
 
 namespace modxml {
 namespace sax {
 
 namespace {
 
+constexpr std::size_t kDefaultBufferSize = 8192;
+constexpr std::size_t kMinBufferSize = 128;
+
+inline bool is_digit(char c) {
+  return c >= '0' && c <= '9';
+}
+
 // 2.2 Characters
 
 // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
@@ -75,12 +98,218 @@ inline bool is_namechar(uint32_t c) {
       (c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040);
 }
 
-/* [5] Name ::= NameStartChar (NameChar)*
+/*
+[5] Name ::= NameStartChar (NameChar)*
 [6] Names ::= Name (#x20 Name)*
 [7] Nmtoken ::= (NameChar)+
 [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
 */
 
+// Lowercases an ASCII letter, any other character is returned unchanged.
+inline char ascii_lowercase(char c) {
+  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
+}
+
+// ASCII case-insensitive comparison. Only |a| is lowercased, so |b| has to
+// be lowercase already for the strings to compare equal.
+bool eq_lowercase(std::string_view a, std::string_view b) {
+  if (a.size() != b.size())
+    return false;
+  for (std::size_t i = 0; i < a.size(); ++i)
+    if (ascii_lowercase(a[i]) != b[i])
+      return false;
+  return true;
+}
+
+// Views the bytes of |span| as characters, nothing is copied.
+inline std::string_view make_string_view(std::span<uint8_t const> span) {
+  return std::string_view(reinterpret_cast<char const*>(span.data()),
+                          span.size());
+}
+
+// The predefined entities, plus expansion of character references (&#NN; and
+// &#xHH;).
+class Entities {
+ public:
+  Entities() {
+    data_.emplace("lt", "<");
+    data_.emplace("gt", ">");
+    data_.emplace("amp", "&");
+    data_.emplace("apos", "'");
+    data_.emplace("quot", "\"");
+  }
+
+  // Returns the replacement text for |entity|, the text between '&' and ';',
+  // or std::nullopt if it is unknown or not a legal character reference.
+  std::optional<std::string> get(std::string const& entity) const {
+    if (entity.empty())
+      return std::nullopt;
+    if (entity.front() == '#') {
+      if (entity.size() == 1)
+        return std::nullopt;
+      int base;
+      char const* start;
+      char const* end = entity.data() + entity.size();
+      if (entity[1] == 'x') {
+        start = entity.data() + 2;
+        base = 16;
+      } else {
+        start = entity.data() + 1;
+        base = 10;
+      }
+      uint32_t value;
+      auto [ptr, ec] = std::from_chars(start, end, value, base);
+      // WFC Legal Character: the referenced code point must match Char ([2]).
+      // NOTE(review): assumes valid_char() implements that production.
+      if (ec == std::errc() && ptr == end && valid_char(value)) {
+        uint8_t tmp[4];
+        std::size_t offset = 0;
+        utf::write8(value, tmp, offset);
+        return std::string(reinterpret_cast<char*>(tmp), offset);
+      }
+    }
+    auto it = data_.find(entity);
+    if (it == data_.end())
+      return std::nullopt;
+    return it->second;
+  }
+
+ private:
+  std::map<std::string, std::string> data_;
+};
+
+// Expands every entity and character reference in |str| in place, starting
+// the search at |last|. Returns false on an unterminated or unknown
+// reference, in which case |str| may be left partially expanded.
+bool deamp(Entities const& entities, std::string& str, std::size_t last = 0) {
+  while (true) {
+    auto next = str.find('&', last);
+    if (next == std::string::npos)
+      break;
+    next += 1;
+    auto semicolon = str.find(';', next);
+    if (semicolon == std::string::npos)
+      return false;
+    auto replacement = entities.get(str.substr(next, semicolon - next));
+    if (!replacement.has_value())
+      return false;
+    // Replace "&name;" and resume after the inserted text, so the loop always
+    // makes progress and a replacement containing '&' is not expanded again.
+    auto const start = next - 1;
+    str.replace(start, semicolon - start + 1, *replacement);
+    last = start + replacement->size();
+  }
+  return true;
+}
+
+// Strips the surrounding quotes from |quoted| and expands all references.
+// Returns std::nullopt if a reference is invalid.
+std::optional<std::string> unquote(Entities const& entities,
+                                   std::string_view quoted) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  std::string ret(quoted.substr(1, quoted.size() - 2));
+  if (deamp(entities, ret))
+    return ret;
+  return std::nullopt;
+}
+
+// Like unquote() but avoids the copy when the value has no references: the
+// returned view then points into |quoted|, otherwise into |tmp|.
+std::optional<std::string_view> unquote_if_needed(Entities const& entities,
+                                                  std::string_view quoted,
+                                                  std::string& tmp) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  auto input = quoted.substr(1, quoted.size() - 2);
+  auto index = input.find('&');
+  if (index == std::string_view::npos)
+    return input;
+  tmp.assign(input);
+  if (deamp(entities, tmp, index))
+    return tmp;
+  return std::nullopt;
+}
+
+// Attributes implementation backed by offsets into the element's raw bytes.
+class AttributesImpl : public Attributes {
+ public:
+  AttributesImpl() = default;
+
+  // |offsets| holds (start, length) pairs into |data|; starting at index
+  // |first| they alternate between attribute name and quoted attribute value.
+  // Returns false if any value contains an invalid reference.
+  bool init(Entities const& entities,
+            std::span<const uint8_t> data,
+            std::vector<size_t> const& offsets,
+            std::size_t first) {
+    std::size_t a = first;
+    attr_.reserve((offsets.size() - first) / 4);
+    while (a + 4 <= offsets.size()) {
+      auto name = make_string_view(data.subspan(offsets[a], offsets[a + 1]));
+      std::string tmp;
+      auto value = unquote_if_needed(
+          entities,
+          make_string_view(data.subspan(offsets[a + 2], offsets[a + 3])),
+          tmp);
+      if (!value.has_value())
+        return false;
+      if (tmp.empty()) {
+        attr_.emplace_back(name, *value);
+      } else {
+        // |*value| points into |tmp|, hand over the storage and let the
+        // attribute create the view from its own copy.
+        attr_.emplace_back(name, std::move(tmp));
+      }
+      a += 4;
+    }
+    return true;
+  }
+
+  iterator begin() const override {
+    return Iterator(this, 0);
+  }
+
+  iterator end() const override {
+    return Iterator(this, attr_.size());
+  }
+
+  std::size_t size() const override {
+    return attr_.size();
+  }
+
+  Attribute const& at(std::size_t index) const override {
+    return attr_[index];
+  }
+
+ private:
+  class Iterator : public iterator {
+   public:
+    Iterator(Attributes const* attributes, std::size_t index)
+        : iterator(attributes, index) {}
+  };
+
+  // Owns the expanded value of an attribute that contained references. Kept
+  // as a base class so it is constructed before Attribute and can be viewed.
+  struct ValueStorage {
+    std::string tmp_;
+  };
+
+  // NOTE: instances must not be copied or moved once created, the view held
+  // by Attribute would keep pointing into the source object. init() reserves
+  // |attr_| up front so no reallocation takes place.
+  struct AttributeImpl : private ValueStorage, public Attribute {
+    AttributeImpl(std::string_view name, std::string_view value)
+        : Attribute(name, value) {}
+
+    AttributeImpl(std::string_view name, std::string&& tmp)
+        : ValueStorage{std::move(tmp)}, Attribute(name, tmp_) {}
+  };
+
+  std::span<const uint8_t> data_;
+  std::vector<AttributeImpl> attr_;
+};
+
 class ProcessorImpl : public Processor {
  public:
   ProcessorImpl(std::shared_ptr<Delegate> delegate,
@@ -91,15 +320,952 @@ class ProcessorImpl : public Processor {
       : delegate_(std::move(delegate)),
         decoder_factory_(std::move(decoder_factory)),
         decoder_(std::move(decoder)),
-        default_buffer_size_(default_buffer_size),
-        max_buffer_size_(max_buffer_size) {}
+        forced_decoder_(decoder_),
+        buffer_(make_buffer(default_buffer_size, max_buffer_size)) {
+    if (!decoder_)
+      decoder_ = create_guessing_decoder();
+
+    expect_document();
+  }
+
+  // Feeds |data| starting at |offset| to the parser and runs queued commands
+  // until more input is needed, an error is reported or the document is done.
+  // Returns the value accumulated in |consumed| by fill_buffer().
+  std::size_t process(std::span<uint8_t const> data,
+                      std::size_t offset) override {
+    cmds_.emplace_back(Command::FILL_BUFFER, Count::ZERO_OR_ONE);
+
+    std::size_t consumed = 0;
+
+    while (true) {
+      if (cmds_.empty()) {
+        if (!buffer_->empty()) {
+          std::cerr << make_string_view(buffer_->rspan()) << std::endl;
+          delegate_->error("Extra data at end");
+        }
+        return consumed;
+      }
+
+      auto current = cmds_.back();
+      auto const old_size = cmds_.size();
+      cmds_.pop_back();
+      Process ret;
+      switch (current.command) {
+        case Command::FILL_BUFFER:
+          ret = fill_buffer(data, offset, consumed);
+          break;
+        case Command::MISC:
+          ret = process_misc(current);
+          break;
+        case Command::SPACE:
+          ret = process_space(current);
+          break;
+        case Command::ELEMENT:
+          ret = process_element(current);
+          break;
+        case Command::COMMENT:
+          ret = process_comment(current);
+          break;
+        case Command::PROCESSING_INSTRUCTION:
+          ret = process_processing_instruction(current);
+          break;
+        case Command::XMLDECL:
+          ret = process_xmldecl(current);
+          break;
+        case Command::ATTRIBUTE:
+          ret = process_attribute(current);
+          break;
+        case Command::NAME:
+          ret = process_name(current);
+          break;
+        case Command::ATTRIBUTE_VALUE:
+          ret = process_attribute_value(current);
+          break;
+        case Command::EQUAL:
+          ret = process_equal(current);
+          break;
+        case Command::START_OR_EMPTY_TAG:
+          ret = process_start_or_empty_tag(current);
+          break;
+        case Command::END_TAG:
+          ret = process_end_tag(current);
+          break;
+      }
+
+      switch (ret) {
+        case Process::NEED_MORE:
+        case Process::ERROR:
+          // Put the command back (with any progress stored in its offset) so
+          // the next call to process() resumes where this one stopped.
+          cmds_.push_back(current);
+          assert(cmds_.size() == old_size);
+          return consumed;
+        case Process::CONTINUE:
+          break;
+      }
+    }
+  }
+
+  uint64_t line() const override { return line_; }
+
+  uint64_t column() const override { return column_; }
 
  private:
+  // Result of running one command.
+  enum class Process {
+    NEED_MORE,
+    ERROR,
+    CONTINUE,
+  };
+
+  // Result of comparing the buffer with an expected string.
+  enum class Match {
+    FULL_MATCH,
+    PARTIAL_MATCH,
+    NO_MATCH,
+  };
+
+  enum class Command {
+    FILL_BUFFER,
+
+    ATTRIBUTE,
+    ATTRIBUTE_VALUE,
+    COMMENT,
+    ELEMENT,
+    END_TAG,
+    EQUAL,
+    MISC,
+    NAME,
+    PROCESSING_INSTRUCTION,
+    SPACE,
+    START_OR_EMPTY_TAG,
+    XMLDECL,
+  };
+
+  // How many times a command is expected to match.
+  enum class Count {
+    ONE,
+    ONE_OR_MANY,
+    ZERO_OR_ONE,
+    ZERO_OR_MANY,
+  };
+
+  // Entry in |cmds_|. |offset| carries command specific progress between
+  // invocations, usually the number of bytes already inspected in the buffer.
+  struct CommandItem {
+    Command const command;
+    Count const count;
+    std::size_t offset;
+
+    CommandItem(Command command, Count count, std::size_t offset = 0)
+        : command(command), count(count), offset(offset) {}
+  };
+
+  // Offsets, as (start, length) pairs, collected while parsing a tag.
+  struct StackItem {
+    std::vector<std::size_t> offsets;
+  };
+
+  // Decodes |data| from |offset| into the buffer.
+  Process fill_buffer(std::span<uint8_t const> data,
+                      std::size_t offset,
+                      std::size_t& consumed) {
+    if (offset >= data.size())
+      return Process::NEED_MORE;
+
+    std::size_t tmp = offset;
+    auto wspan = buffer_->wspan(4);
+    switch (decoder_->decode(data, tmp, wspan, consumed)) {
+      case Decoder::State::GOOD:
+        break;
+      case Decoder::State::NEED_MORE:
+        return Process::NEED_MORE;
+      case Decoder::State::INVALID:
+        delegate_->error("Invalid data");
+        return Process::ERROR;
+    }
+    buffer_->commit(consumed);
+    return Process::CONTINUE;
+  }
+
+  // Commands are run from the back of |cmds_|, so each expect_*() helper
+  // queues its parts in reverse order.
+  void expect_document() {
+    // document := prolog element Misc*
+    expect_misc(Count::ZERO_OR_MANY);
+    expect_element(Count::ONE);
+    expect_prolog();
+  }
+
+  void expect_misc(Count count) {
+    cmds_.emplace_back(Command::MISC, count);
+  }
+
+  void expect_element(Count count) {
+    // element ::= EmptyElemTag | STag content ETag
+    cmds_.emplace_back(Command::START_OR_EMPTY_TAG, count);
+  }
+
+  void expect_end_tag(Count count) {
+    cmds_.emplace_back(Command::END_TAG, count);
+  }
+
+  void expect_prolog() {
+    // prolog := XMLDecl? Misc* (doctypedecl Misc*)?
+    expect_misc(Count::ZERO_OR_MANY);
+    expect_doctypedecl(Count::ZERO_OR_ONE);
+    expect_misc(Count::ZERO_OR_MANY);
+    expect_xmldecl(Count::ZERO_OR_ONE);
+  }
+
+  void expect_xmldecl(Count count) {
+    cmds_.emplace_back(Command::XMLDECL, count);
+  }
+
+  void expect_doctypedecl(Count) {
+    // TODO
+  }
+
+  void expect_comment(Count count, std::size_t start_offset = 0) {
+    // Comment should never be more than one, should be MISC that is repeated.
+    assert(count == Count::ONE);
+    cmds_.emplace_back(Command::COMMENT, count, start_offset);
+  }
+
+  void expect_content(Count) {
+    // TODO
+  }
+
+  void expect_pi(Count count, std::size_t start_offset = 0) {
+    // PI should never be more than one, should be MISC that is repeated.
+    assert(count == Count::ONE);
+    cmds_.emplace_back(Command::PROCESSING_INSTRUCTION, count, start_offset);
+  }
+
+  void expect_space(Count count) {
+    // There is no way to have S S as S is continuous, so we should never
+    // ask for more than one or zero.
+    assert(count == Count::ZERO_OR_ONE || count == Count::ONE);
+    cmds_.emplace_back(Command::SPACE, count);
+  }
+
+  void expect_attribute(Count count) {
+    switch (count) {
+      case Count::ONE_OR_MANY:
+        cmds_.emplace_back(Command::ATTRIBUTE, Count::ZERO_OR_MANY);
+        [[fallthrough]];
+      case Count::ONE:
+        // Attribute ::= Name Eq AttValue
+        expect_attribute_value(Count::ONE);
+        expect_equal(Count::ONE);
+        expect_name(Count::ONE);
+        expect_space(Count::ONE);
+        break;
+      case Count::ZERO_OR_ONE:
+      case Count::ZERO_OR_MANY:
+        cmds_.emplace_back(Command::ATTRIBUTE, count);
+        break;
+    }
+  }
+
+  void expect_attribute_value(Count count) {
+    cmds_.emplace_back(Command::ATTRIBUTE_VALUE, count);
+  }
+
+  void expect_equal(Count count) {
+    // Eq ::= S? '=' S?
+    expect_space(Count::ZERO_OR_ONE);
+    cmds_.emplace_back(Command::EQUAL, count);
+    expect_space(Count::ZERO_OR_ONE);
+  }
+
+  void expect_name(Count count) {
+    cmds_.emplace_back(Command::NAME, count);
+  }
+
+  Process process_misc(CommandItem const& item) {
+    // Misc := Comment | PI | S
+    assert(item.offset == 0);
+
+    switch (match("<!--")) {
+      case Match::FULL_MATCH:
+        add_if_more(item);
+        // Start looking for the end after the four bytes of "<!--".
+        expect_comment(Count::ONE, 4);
+        return Process::CONTINUE;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        break;
+    }
+
+    switch (match("<?")) {
+      case Match::FULL_MATCH:
+        add_if_more(item);
+        expect_pi(Count::ONE, 2);
+        return Process::CONTINUE;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        break;
+    }
+
+    switch (match_s()) {
+      case Match::FULL_MATCH:
+        add_if_more(item);
+        expect_space(Count::ONE);
+        return Process::CONTINUE;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        break;
+    }
+
+    return no_match(item);
+  }
+
+  Process process_attribute(CommandItem& item) {
+    // This actually parses (S Attribute)* when followed by S?
+    // for Attribute parsing see expect_attribute()
+    // So we need to figure out if the S means start of attribute
+    // or just an S. We do this by checking if the first non-S is
+    // a namestart or something else. We consume the S.
+    uint32_t last_char;
+    auto ret = consume_space(item.offset, last_char);
+    if (ret != Process::CONTINUE)
+      return ret;
+
+    // No S, cannot be followed by an attribute then.
+    if (item.offset == 0)
+      return no_match(item);
+
+    // First character after S isn't a valid first character of a name,
+    // cannot be followed by an attribute then.
+    if (!is_namestartchar(last_char))
+      return no_match(item);
+
+    // Queue ourselves again first so any further attributes are parsed once
+    // this one is done.
+    add_if_more(item);
+    expect_attribute_value(Count::ONE);
+    expect_equal(Count::ONE);
+    expect_name(Count::ONE);
+    return Process::CONTINUE;
+  }
+
+  Process process_equal(CommandItem const& item) {
+    // Eq ::= S? '=' S?
+    // Spacing added by expect_equal
+    switch (match_consume("=")) {
+      case Match::FULL_MATCH:
+        add_if_more(item);
+        return Process::CONTINUE;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        return no_match(item);
+    }
+  }
+
+  Process process_name(CommandItem& item) {
+    // Name ::= NameStartChar (NameChar)*
+    // The name is recorded as a (start, length) pair on the current stack item.
+    auto data = buffer_->rspan(item.offset + 4);
+    while (true) {
+      std::size_t tmp = item.offset;
+      auto c = utf::read8(data, tmp);
+      if (c == utf::NEED_MORE)
+        return Process::NEED_MORE;
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, tmp);
+      if (item.offset == 0) {
+        if (!is_namestartchar(c))
+          return no_match(item);
+      } else {
+        if (!is_namechar(c))
+          break;
+      }
+      item.offset = tmp;
+    }
+
+    assert(!stack_.empty());
+    auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+    stack_.back().offsets.push_back(read_view->consumed());
+    stack_.back().offsets.push_back(item.offset);
+    buffer_->consume(item.offset);
+    return Process::CONTINUE;
+  }
+
+  Process process_attribute_value(CommandItem& item) {
+    // AttValue ::= '"' ([^<&"] | Reference)* '"'
+    //           | "'" ([^<&'] | Reference)* "'"
+
+    uint32_t end_char;
+    auto data = buffer_->rspan(item.offset + 4);
+
+    if (item.offset == 0) {
+      std::size_t tmp = item.offset;
+      auto c = utf::read8(data, tmp);
+      if (c == utf::NEED_MORE)
+        return Process::NEED_MORE;
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, tmp);
+      if (c != '"' && c != '\'')
+        return no_match(item);
+      item.offset = tmp;
+      end_char = c;
+    } else {
+      assert(!data.empty());
+      end_char = data[0];  // ok as both " and ' are ASCII
+    }
+
+    while (true) {
+      auto c = utf::read8(data, item.offset);
+      if (c == utf::NEED_MORE)
+        return Process::NEED_MORE;
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, item.offset);
+      if (c == end_char)
+        break;
+      // TODO: Should we validate reference already here or do we let
+      // unquote take care of that? As Reference can't contain end_char
+      // only checking for end_char is safe here.
+    }
+
+    assert(!stack_.empty());
+    auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+    stack_.back().offsets.push_back(read_view->consumed());
+    stack_.back().offsets.push_back(item.offset);
+    buffer_->consume(item.offset);
+    return Process::CONTINUE;
+  }
+
+  Process process_comment(CommandItem& item) {
+    // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
+    // The "<!--" is left in the buffer, |item.offset| tracks how far the
+    // search for "-->" has come so it can resume when more data arrives.
+    constexpr std::size_t kStartLength = 4;  // length of "<!--"
+    if (item.offset == 0) {
+      switch (match("<!--")) {
+        case Match::FULL_MATCH:
+          item.offset = kStartLength;
+          break;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    switch (find("-->", item.offset)) {
+      case Match::FULL_MATCH: {
+        auto data = buffer_->rspan(item.offset);
+        assert(data.size() >= item.offset);
+        delegate_->comment(make_string_view(
+            data.subspan(kStartLength, item.offset - kStartLength)));
+        buffer_->consume(item.offset + 3);
+        return Process::CONTINUE;
+      }
+      case Match::NO_MATCH:
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+    }
+  }
+
+  Process process_processing_instruction(CommandItem& item) {
+    if (item.offset == 0) {
+      switch (match_consume("<?")) {
+        case Match::FULL_MATCH:
+          item.offset += 2;
+          break;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    // TODO
+    delegate_->error("PI not supported");
+    return Process::ERROR;
+  }
+
+  // Re-queues |item| with |offset| as its state and starts collecting offsets
+  // for it. The buffer is wrapped in a read view so the bytes consumed while
+  // parsing the tag can still be read from the real buffer after pop_stack().
+  void add_to_stack(CommandItem const& item, std::size_t offset) {
+    cmds_.emplace_back(item.command, item.count, offset);
+    stack_.emplace_back();
+    buffer_ = make_read_view_buffer(std::move(buffer_));
+    buffer_->consume(offset);
+  }
+
+  // Ends the collection started by add_to_stack(). Moves the offsets to |attr|
+  // and returns the number of bytes that were consumed from the read view.
+  std::size_t pop_stack(std::vector<std::size_t>& attr) {
+    assert(!stack_.empty());
+    std::swap(attr, stack_.back().offsets);
+
+    auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+    auto consumed = read_view->consumed();
+
+    buffer_ = read_view->release();
+    stack_.pop_back();
+
+    return consumed;
+  }
+
+  Process process_xmldecl(CommandItem const& item) {
+    // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+    if (item.offset == 0) {
+      switch (match("<?xml")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 5);
+          expect_space(Count::ZERO_OR_ONE);
+          // Parsing as generic "Attribute" here and doing validation later.
+          expect_attribute(Count::ONE_OR_MANY);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    assert(item.offset == 5);
+
+    // Remember that this is still reading from the read view buffer.
+    switch (match_consume("?>")) {
+      case Match::FULL_MATCH:
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        delegate_->error(std::format("Expected end of {}",
+                                     command_name(item.command)));
+        return Process::ERROR;
+    }
+
+    std::vector<std::size_t> attr;
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+    std::size_t a = 0;
+
+    if (a + 4 <= attr.size() &&
+        make_string_view(data.subspan(attr[a + 0],
+                                      attr[a + 1])) == "version") {
+      auto version = make_string_view(data.subspan(attr[a + 2] + 1,
+                                                   attr[a + 3] - 2));
+      if (!valid_version(version)) {
+        delegate_->error(std::format("Unsupported xmldecl version, {}",
+                                     version));
+        return Process::ERROR;
+      }
+      a += 4;
+    } else {
+      // No version
+      delegate_->error("Invalid xmldecl, must have a version attribute first.");
+      return Process::ERROR;
+    }
+
+    if (a + 4 <= attr.size() &&
+        make_string_view(data.subspan(attr[a + 0],
+                                      attr[a + 1])) == "encoding") {
+      auto encoding = make_string_view(data.subspan(attr[a + 2] + 1,
+                                                    attr[a + 3] - 2));
+      if (forced_decoder_) {
+        // encoding value is ignored
+        // TODO: Should we check that it is valid anyway?
+      } else {
+        auto decoder = pick_decoder_for_encoding(encoding, nullptr);
+        if (!decoder && decoder_factory_)
+          decoder = decoder_factory_->create(encoding);
+        if (!decoder) {
+          delegate_->error(std::format("Unknown encoding {}", encoding));
+          return Process::ERROR;
+        }
+        std::swap(decoder_, decoder);
+        // TODO: Re-decode the rest of the buffer?
+      }
+      a += 4;
+    }
+
+    if (a + 4 <= attr.size() &&
+        make_string_view(data.subspan(attr[a + 0],
+                                      attr[a + 1])) == "standalone") {
+      auto sd = make_string_view(data.subspan(attr[a + 2] + 1,
+                                              attr[a + 3] - 2));
+      if (sd == "yes") {
+        // TODO: Handle standalone == yes
+      } else if (sd == "no") {
+        // TODO: Handle standalone == no
+      } else {
+        delegate_->error(std::format(
+            "Invalid xmldecl, standalone attribute has unsupported value, {}",
+            sd));
+        return Process::ERROR;
+      }
+      a += 4;
+    }
+
+    if (a < attr.size()) {
+      delegate_->error(
+          std::format("Invalid xmldecl, unknown attribute, {}",
+                      make_string_view(data.subspan(attr[a + 0],
+                                                    attr[a + 1]))));
+      return Process::ERROR;
+    }
+
+    buffer_->consume(consumed);
+    return Process::CONTINUE;
+  }
+
+  Process process_start_or_empty_tag(CommandItem const& item) {
+    // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
+    // STag ::= '<' Name (S Attribute)* S? '>'
+    if (item.offset == 0) {
+      switch (match("<")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 1);
+          expect_space(Count::ZERO_OR_ONE);
+          expect_attribute(Count::ZERO_OR_MANY);
+          expect_name(Count::ONE);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    assert(item.offset == 1);
+
+    bool empty_tag;
+
+    // Remember that this is still reading from the read view buffer.
+    switch (match_consume("/>")) {
+      case Match::FULL_MATCH:
+        empty_tag = true;
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        switch (match_consume(">")) {
+          case Match::FULL_MATCH:
+            empty_tag = false;
+            break;
+          case Match::PARTIAL_MATCH:
+            return Process::NEED_MORE;
+          case Match::NO_MATCH:
+            delegate_->error(std::format("Expected end of {}",
+                                         command_name(item.command)));
+            return Process::ERROR;
+        }
+        break;
+    }
+
+    std::vector<std::size_t> attr;
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+
+    assert(attr.size() >= 2);
+    auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+    AttributesImpl attributes;
+    if (!attributes.init(entities_, data, attr, 2)) {
+      delegate_->error("Invalid references in attribute values");
+      return Process::ERROR;
+    }
+
+    add_if_more(item);
+
+    if (empty_tag) {
+      delegate_->empty_element(name, attributes);
+    } else {
+      delegate_->start_element(name, attributes);
+      expect_end_tag(Count::ONE);
+      expect_content(Count::ONE);
+    }
+
+    buffer_->consume(consumed);
+    return Process::CONTINUE;
+  }
+
+  Process process_end_tag(CommandItem const& item) {
+    // ETag ::= '</' Name S? '>'
+    if (item.offset == 0) {
+      switch (match("</")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 2);
+          expect_space(Count::ZERO_OR_ONE);
+          expect_name(Count::ONE);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    assert(item.offset == 2);
+
+    // Remember that this is still reading from the read view buffer.
+    switch (match_consume(">")) {
+      case Match::FULL_MATCH:
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        delegate_->error(std::format("Expected end of {}",
+                                     command_name(item.command)));
+        return Process::ERROR;
+    }
+
+    std::vector<std::size_t> attr;
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+
+    assert(attr.size() == 2);
+    auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+    add_if_more(item);
+
+    delegate_->end_element(name);
+
+    buffer_->consume(consumed);
+    return Process::CONTINUE;
+  }
+
+  // Accepts "1." followed by one or more digits.
+  static bool valid_version(std::string_view version) {
+    if (version.size() < 3)
+      return false;
+    if (!version.starts_with("1."))
+      return false;
+    for (std::size_t i = 2; i < version.size(); ++i) {
+      if (!is_digit(version[i]))
+        return false;
+    }
+    return true;
+  }
+
+  Process process_element(CommandItem& item) {
+    // TODO
+    delegate_->error("Element is not yet supported");
+    return Process::ERROR;
+  }
+
+  // Consumes leading whitespace from the buffer, adding the number of
+  // whitespace characters to |count|. On CONTINUE |last_char| is the first
+  // character after the whitespace, which is left in the buffer.
+  Process consume_space(std::size_t& count, uint32_t& last_char) {
+    auto data = buffer_->rspan(4);
+    std::size_t consumed = 0;
+    while (true) {
+      std::size_t offset = consumed;
+      auto c = utf::read8(data, offset);
+      if (c == utf::NEED_MORE) {
+        buffer_->consume(consumed);
+        return Process::NEED_MORE;
+      }
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, offset);
+      if (!is_ws(c)) {
+        last_char = c;
+        buffer_->consume(consumed);
+        return Process::CONTINUE;
+      }
+      ++count;
+      handle_ws(c);
+      consumed = offset;
+    }
+  }
+
+  Process process_space(CommandItem& item) {
+    // S ::= (#x20 | #x9 | #xD | #xA)+
+    // item.offset is only used to count spaces. We consume each space as it
+    // is found so no offset in buffer.
+    uint32_t unused;
+    auto ret = consume_space(item.offset, unused);
+    if (ret != Process::CONTINUE)
+      return ret;
+
+    if (item.offset == 0)
+      return no_match(item);
+
+    add_if_more(item);
+    return Process::CONTINUE;
+  }
+
+  // Queues |item.command| again if |item.count| allows further matches after
+  // a successful one.
+  void add_if_more(CommandItem const& item) {
+    switch (item.count) {
+      case Count::ONE:
+        break;
+      case Count::ONE_OR_MANY:
+        cmds_.emplace_back(item.command, Count::ZERO_OR_MANY);
+        break;
+      case Count::ZERO_OR_ONE:
+        break;
+      case Count::ZERO_OR_MANY:
+        cmds_.emplace_back(item.command, item.count);
+    }
+  }
+
+  // Searches the buffer for |str| starting at |offset|.
+  // FULL_MATCH: |offset| is the index where |str| starts.
+  // PARTIAL_MATCH: the data ends with a proper prefix of |str| that starts at
+  // |offset|; call again from there when more data is available.
+  // NO_MATCH: |offset| is the end of the data, nothing before it can match.
+  Match find(std::string_view str, std::size_t& offset) {
+    auto const data = make_string_view(buffer_->rspan(offset + str.size()));
+    if (offset >= data.size())
+      return Match::NO_MATCH;
+
+    auto const pos = data.find(str, offset);
+    if (pos != std::string_view::npos) {
+      offset = pos;
+      return Match::FULL_MATCH;
+    }
+
+    // Does the data end with the beginning of |str|? Try the longest first.
+    auto len = std::min(str.size() - 1, data.size() - offset);
+    for (; len > 0; --len) {
+      if (data.substr(data.size() - len) == str.substr(0, len)) {
+        offset = data.size() - len;
+        return Match::PARTIAL_MATCH;
+      }
+    }
+    offset = data.size();
+    return Match::NO_MATCH;
+  }
+
+  // Compares the buffer at |offset| with |str|. PARTIAL_MATCH means the data
+  // available so far agrees with |str| but more is needed to decide.
+  Match match(std::string_view str, std::size_t offset = 0) {
+    auto data = buffer_->rspan(offset + str.size());
+    if (data.size() <= offset)
+      return Match::PARTIAL_MATCH;
+    auto const avail = std::min(str.size(), data.size() - offset);
+    for (std::size_t i = 0; i < avail; ++i) {
+      if (str[i] != data[offset + i])
+        return Match::NO_MATCH;
+    }
+    if (avail < str.size())
+      return Match::PARTIAL_MATCH;
+    return Match::FULL_MATCH;
+  }
+
+  // Like match() at offset zero, but consumes |str| on a full match.
+  Match match_consume(std::string_view str) {
+    auto ret = match(str);
+    if (ret == Match::FULL_MATCH)
+      buffer_->consume(str.size());
+    return ret;
+  }
+
+  // Checks if the buffer starts with a whitespace character, see S.
+  Match match_s() {
+    auto data = buffer_->rspan(4);
+    std::size_t offset = 0;
+    auto c = utf::read8(data, offset);
+    if (c == utf::NEED_MORE)
+      return data.size() == 0 ? Match::PARTIAL_MATCH : Match::NO_MATCH;
+    if (c == utf::INVALID)
+      return Match::NO_MATCH;
+    if (!valid_char(c) || !is_ws(c))
+      return Match::NO_MATCH;
+    return Match::FULL_MATCH;
+  }
+
+  // Called when |item| did not match: an error if at least one match was
+  // required, otherwise parsing just continues with the next command.
+  Process no_match(CommandItem const& item) {
+    switch (item.count) {
+      case Count::ONE:
+      case Count::ONE_OR_MANY:
+        delegate_->error(std::format("Expected {}",
+                                     command_name(item.command)));
+        return Process::ERROR;
+      case Count::ZERO_OR_ONE:
+      case Count::ZERO_OR_MANY:
+        break;
+    }
+    return Process::CONTINUE;
+  }
+
+  // Updates |line_| and |column_| for the whitespace character |c|.
+  void handle_ws(uint32_t c) {
+    if (c == '\n') {
+      ++line_;
+      column_ = 0;
+    } else {
+      ++column_;
+    }
+  }
+
+  // Reports an invalid character. Callers pass the offset after the read, it
+  // may be one past the end of |data| so it is checked before being used.
+  Process invalid_char(std::span<uint8_t const> data, std::size_t offset) {
+    if (offset < data.size()) {
+      delegate_->error(std::format("Invalid char {:02x}", data[offset]));
+    } else {
+      delegate_->error("Invalid char");
+    }
+    return Process::ERROR;
+  }
+
+  // Human readable name of |command|, used in error messages.
+  static std::string_view command_name(Command command) {
+    switch (command) {
+      case Command::MISC:
+        return "misc"sv;
+      case Command::FILL_BUFFER:
+        return "more data"sv;
+      case Command::ELEMENT:
+        return "element"sv;
+      case Command::SPACE:
+        return "whitespace"sv;
+      case Command::COMMENT:
+        return "comment"sv;
+      case Command::PROCESSING_INSTRUCTION:
+        return "processing instruction"sv;
+      case Command::XMLDECL:
+        return "xml declaration"sv;
+      case Command::ATTRIBUTE:
+        return "attribute"sv;
+      case Command::ATTRIBUTE_VALUE:
+        return "attribute value"sv;
+      case Command::NAME:
+        return "name"sv;
+      case Command::EQUAL:
+        return "equal sign (=)"sv;
+      case Command::START_OR_EMPTY_TAG:
+        return "element"sv;
+      case Command::END_TAG:
+        return "end tag"sv;
+    }
+    assert(false);
+    return {};
+  }
+
   std::shared_ptr<Delegate> delegate_;
   std::shared_ptr<DecoderFactory> decoder_factory_;
   std::unique_ptr<Decoder> decoder_;
-  std::size_t default_buffer_size_;
-  std::size_t max_buffer_size_;
+  bool const forced_decoder_;
+  std::unique_ptr<Buffer> buffer_;
+  Entities entities_;
+  std::vector<CommandItem> cmds_;
+  std::vector<StackItem> stack_;
+  uint64_t line_{1};
+  uint64_t column_{0};
 };
 
 }  // namespace
@@ -117,9 +1283,9 @@ std::unique_ptr<Processor> create_processor(
                                                decoder_factory.get());
   }
 
-  std::size_t default_buffer_size = 8192;
+  std::size_t default_buffer_size = kDefaultBufferSize;
   if (opt_default_buffer_size.has_value())
-    default_buffer_size = std::max(static_cast<std::size_t>(128),
+    default_buffer_size = std::max(kMinBufferSize,
                                    opt_default_buffer_size.value());
   // This value is documented in public headers. Do NOT change.
   std::size_t max_buffer_size = 10 * 1024 * 1024;
@@ -136,7 +1302,8 @@ std::unique_ptr<Processor> create_processor(
                           max_buffer_size);
 }
 
-std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) {
+std::unique_ptr<Processor>
+Processor::create(std::shared_ptr<Delegate> delegate) {
   return create_processor(std::move(delegate), nullptr, std::nullopt,
                           std::nullopt, std::nullopt);
 }
