#include "sax_processor.hh" #include "buffer.hh" #include "guessing_decoder.hh" #include "processor.hh" #include "sax_attributes.hh" #include "sax_decoder.hh" #include "sax_decoder_factory.hh" #include "sax_delegate.hh" #include "utf8.hh" #include "utf_error.hh" #include "utils.hh" #include #include #include #include #include #include #include #include using namespace std::string_view_literals; namespace modxml { namespace sax { namespace { constexpr std::size_t kDefaultBufferSize = 8192; constexpr std::size_t kMinBufferSize = 128; inline bool is_digit(char c) { return c >= '0' && c <= '9'; } // 2.2 Characters // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] inline bool valid_char(uint32_t c) { // Assume valid unicode (U+0 - U+10ffff except surrogate blocks) if (c >= 0x20 && c <= 0xfffd) return true; if (c == 0x9 || c == 0xa || c == 0xd) return true; return c >= 0x10000; } // 2.3 Common Syntactic Constructs // [3] S ::= (#x20 | #x9 | #xD | #xA)+ inline bool is_ws(uint32_t c) { // Assume we already checked for valid_char. return c <= 0x20; } // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] inline bool is_namestartchar(uint32_t c) { if (c < 0x41 /* A */) return c == 0x3a /* : */; if (c <= 0x5a /* Z */) return true; if (c < 0x61 /* a */) return c == 0x5f /* _ */; if (c <= 0x7a /* z */) return true; if (c < 0xc0) return false; if (c < 0x300) return c != 0xd7 && c != 0xf7; if (c > 0x37d && c < 0x37f) return false; if (c > 0x1fff && c < 0x200c) return false; if (c > 0x200d && c < 0x2070) return false; if (c > 0x218f && c < 0x2c00) return false; if (c > 0x2fef && c < 0x3001) return false; // Already valid_char so don't check for surrogate pair here. if (c > 0xdfff && c < 0xf900) return false; if (c > 0xfdcf && c < 0xfdf0) return false; if (c > 0xfffd && c < 0x10000) return false; return true; } inline bool is_namechar(uint32_t c) { return is_namestartchar(c) || (c >= 0x2d /* - */ && c <= 0x2e /* . */) || (c >= 0x30 /* 0 */ && c <= 0x39 /* 9 */) || (c == 0xb7) || (c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040); } /* [5] Name ::= NameStartChar (NameChar)* [6] Names ::= Name (#x20 Name)* [7] Nmtoken ::= (NameChar)+ [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* */ inline bool ascii_lowercase(char c) { return (c >= 'A' & c <= 'Z') ? (c | 0x20) : c; } bool eq_lowercase(std::string_view a, std::string_view b) { if (a.size() != b.size()) return false; for (std::size_t i = 0; i < a.size(); ++i) if (ascii_lowercase(a[i]) != b[i]) return false; return true; } inline std::string_view make_string_view(std::span span) { return std::string_view(reinterpret_cast(span.data()), span.size()); } class Entities { public: Entities() { data_.emplace("lt", "<"); data_.emplace("gt", ">"); data_.emplace("amp", "&"); data_.emplace("apos", "'"); data_.emplace("quot", "\""); } std::optional get(std::string const& entity) const { if (entity.empty()) return std::nullopt; if (entity.front() == '#') { if (entity.size() == 1) return std::nullopt; int base; char const* start; char const* end = entity.data() + entity.size(); if (entity[1] == 'x') { start = entity.data() + 2; base = 16; } else { start = entity.data() + 1; base = 10; } uint32_t value; auto [ptr, ec] = std::from_chars(start, end, value, base); if (ec == std::errc() && ptr == end) { uint8_t tmp[4]; std::size_t offset = 0; utf::write8(value, tmp, offset); return std::string(reinterpret_cast(tmp), offset); } } auto it = data_.find(entity); if (it == data_.end()) return std::nullopt; return it->second; } private: std::map data_; }; bool deamp(Entities const& entities, std::string& str, std::size_t last = 0) { while (true) { auto next = str.find('&', last); if (next == std::string::npos) break; next += 1; auto semicolon = str.find(';', next); if (semicolon == std::string::npos) return false; auto replacement = entities.get(str.substr(next, semicolon - next)); if (!replacement.has_value()) return false; } return true; } std::optional unquote(Entities const& entities, std::string_view quoted) { assert(quoted.size() >= 2); assert(quoted.front() == quoted.back()); std::string ret(quoted.substr(1, quoted.size() - 2)); if (deamp(entities, ret)) return ret; return std::nullopt; } std::optional unquote_if_needed(Entities const& entities, std::string_view quoted, std::string& tmp) { assert(quoted.size() >= 2); assert(quoted.front() == quoted.back()); auto input = quoted.substr(1, quoted.size() - 2); auto index = input.find('&'); if (index == std::string_view::npos) return input; tmp.assign(input); if (deamp(entities, tmp, index)) return tmp; return std::nullopt; } class AttributesImpl : public Attributes { public: AttributesImpl() = default; bool init(Entities const& entities, std::span data, std::vector const& offsets, std::size_t first) { std::size_t a = first; attr_.reserve((offsets.size() - first) / 4); while (a + 4 <= offsets.size()) { auto name = make_string_view(data.subspan(offsets[a], offsets[a + 1])); std::string tmp; auto value = unquote_if_needed( entities, make_string_view(data.subspan(offsets[a + 2], offsets[a + 3])), tmp); if (!value.has_value()) return false; if (tmp.empty()) { attr_.emplace_back(name, *value); } else { attr_.emplace_back(name, *value, std::move(tmp)); } a += 4; } return true; } iterator begin() const override { return Iterator(this, 0); } iterator end() const override { return Iterator(this, attr_.size()); } std::size_t size() const override { return attr_.size(); } Attribute const& at(std::size_t index) const override { return attr_[index]; } private: class Iterator : public iterator { public: Iterator(Attributes const* attributes, std::size_t index) : iterator(attributes, index) {} }; struct AttributeImpl : public Attribute { AttributeImpl(std::string_view name, std::string_view value) : Attribute(name, value) {} AttributeImpl(std::string_view name, std::string_view value, std::string&& tmp) : Attribute(name, value), tmp_(std::move(tmp)) {} private: std::string tmp_; }; std::span data_; std::vector attr_; }; class ProcessorImpl : public Processor { public: ProcessorImpl(std::shared_ptr delegate, std::shared_ptr decoder_factory, std::unique_ptr decoder, std::size_t default_buffer_size, std::size_t max_buffer_size) : delegate_(std::move(delegate)), decoder_factory_(std::move(decoder_factory)), decoder_(std::move(decoder)), forced_decoder_(decoder_), buffer_(make_buffer(default_buffer_size, max_buffer_size)) { if (!decoder_) decoder_ = create_guessing_decoder(); expect_document(); } std::size_t process(std::span data, std::size_t offset) override { cmds_.emplace_back(Command::FILL_BUFFER, Count::ZERO_OR_ONE); std::size_t consumed = 0; while (true) { if (cmds_.empty()) { if (!buffer_->empty()) { delegate_->error("Extra data at end"); } return consumed; } auto current = cmds_.back(); auto const old_size = cmds_.size(); cmds_.pop_back(); Process ret; switch (current.command) { case Command::FILL_BUFFER: ret = fill_buffer(data, offset, consumed); break; case Command::MISC: ret = process_misc(current); break; case Command::SPACE: ret = process_space(current); break; case Command::ELEMENT: ret = process_element(current); break; case Command::COMMENT: ret = process_comment(current); break; case Command::PROCESSING_INSTRUCTION: ret = process_processing_instruction(current); break; case Command::XMLDECL: ret = process_xmldecl(current); break; case Command::ATTRIBUTE: ret = process_attribute(current); break; case Command::NAME: ret = process_name(current); break; case Command::ATTRIBUTE_VALUE: ret = process_attribute_value(current); break; case Command::EQUAL: ret = process_equal(current); break; case Command::START_OR_EMPTY_TAG: ret = process_start_or_empty_tag(current); break; case Command::END_TAG: ret = process_end_tag(current); break; } switch (ret) { case Process::NEED_MORE: case Process::ERROR: cmds_.push_back(current); assert(cmds_.size() == old_size); return consumed; case Process::CONTINUE: break; } } } uint64_t line() const override { return line_; } uint64_t column() const override { return column_; } private: enum class Process { NEED_MORE, ERROR, CONTINUE, }; enum class Match { FULL_MATCH, PARTIAL_MATCH, NO_MATCH, }; enum class Command { FILL_BUFFER, ATTRIBUTE, ATTRIBUTE_VALUE, COMMENT, ELEMENT, END_TAG, EQUAL, MISC, NAME, PROCESSING_INSTRUCTION, SPACE, START_OR_EMPTY_TAG, XMLDECL, }; enum class Count { ONE, ONE_OR_MANY, ZERO_OR_ONE, ZERO_OR_MANY, }; struct CommandItem { Command const command; Count const count; std::size_t offset; CommandItem(Command command, Count count, std::size_t offset = 0) : command(command), count(count), offset(offset) {} }; struct StackItem { std::vector offsets; }; Process fill_buffer(std::span data, std::size_t offset, std::size_t& consumed) { if (offset >= data.size()) return Process::NEED_MORE; std::size_t tmp = offset; auto wspan = buffer_->wspan(4); std::size_t wrote = 0; switch (decoder_->decode(data, tmp, wspan, wrote)) { case Decoder::State::GOOD: break; case Decoder::State::NEED_MORE: return Process::NEED_MORE; case Decoder::State::INVALID: delegate_->error("Invalid data"); return Process::ERROR; } buffer_->commit(wrote); consumed = tmp - offset; return Process::CONTINUE; } void expect_document() { // document := prolog element Misc* expect_misc(Count::ZERO_OR_MANY); expect_element(Count::ONE); expect_prolog(); } void expect_misc(Count count) { cmds_.emplace_back(Command::MISC, count); } void expect_element(Count count) { // element ::= EmptyElemTag | STag content ETag cmds_.emplace_back(Command::START_OR_EMPTY_TAG, count); } void expect_end_tag(Count count) { cmds_.emplace_back(Command::END_TAG, count); } void expect_prolog() { // prolog := XMLDecl? Misc* (doctypedecl Misc*)? expect_misc(Count::ZERO_OR_MANY); expect_doctypedecl(Count::ZERO_OR_ONE); expect_misc(Count::ZERO_OR_MANY); expect_xmldecl(Count::ZERO_OR_ONE); } void expect_xmldecl(Count count) { cmds_.emplace_back(Command::XMLDECL, count); } void expect_doctypedecl(Count) { // TODO } void expect_comment(Count count, std::size_t start_offset = 0) { // Comment should never be more than one, should be MISC that is repeated. assert(count == Count::ONE); cmds_.emplace_back(Command::COMMENT, count, start_offset); } void expect_content(Count) { // TODO } void expect_pi(Count count, std::size_t start_offset = 0) { // PI should never be more than one, should be MISC that is repeated. assert(count == Count::ONE); cmds_.emplace_back(Command::PROCESSING_INSTRUCTION, count, start_offset); } void expect_space(Count count) { // There is not way to have SS as S is continous, so we should never // ask for more than one or zero. assert(count == Count::ZERO_OR_ONE || count == Count::ONE); cmds_.emplace_back(Command::SPACE, count); } void expect_attribute(Count count) { switch (count) { case Count::ONE_OR_MANY: cmds_.emplace_back(Command::ATTRIBUTE, Count::ZERO_OR_MANY); case Count::ONE: // Attribute ::= Name Eq AttValue expect_attribute_value(Count::ONE); expect_equal(Count::ONE); expect_name(Count::ONE); expect_space(Count::ONE); break; case Count::ZERO_OR_ONE: case Count::ZERO_OR_MANY: cmds_.emplace_back(Command::ATTRIBUTE, count); break; } } void expect_attribute_value(Count count) { cmds_.emplace_back(Command::ATTRIBUTE_VALUE, count); } void expect_equal(Count count) { // Eq ::= S? '=' S? expect_space(Count::ZERO_OR_ONE); cmds_.emplace_back(Command::EQUAL, count); expect_space(Count::ZERO_OR_ONE); } void expect_name(Count count) { cmds_.emplace_back(Command::NAME, count); } Process process_misc(CommandItem const& item) { // Misc := Comment | PI | S assert(item.offset == 0); switch (match("", item.offset); switch (match) { case Match::FULL_MATCH: { auto data = buffer_->rspan(item.offset); assert(data.size() >= item.offset); delegate_->comment( make_string_view(data.subspan(3, item.offset - 3))); buffer_->consume(item.offset + 3); return Process::CONTINUE; } case Match::NO_MATCH: case Match::PARTIAL_MATCH: return Process::NEED_MORE; } } Process process_processing_instruction(CommandItem& item) { if (item.offset == 0) { switch (match_consume("error("PI not supported"); return Process::ERROR; } void add_to_stack(CommandItem const& item, std::size_t offset) { cmds_.emplace_back(item.command, item.count, offset); stack_.emplace_back(); buffer_ = make_read_view_buffer(std::move(buffer_)); buffer_->consume(offset); } std::size_t pop_stack(std::vector& attr) { assert(!stack_.empty()); std::swap(attr, stack_.back().offsets); auto* read_view = static_cast(buffer_.get()); auto consumed = read_view->consumed(); buffer_ = read_view->release(); stack_.pop_back(); return consumed; } Process process_xmldecl(CommandItem const& item) { // XMLDecl ::= '' if (item.offset == 0) { switch (match("")) { case Match::FULL_MATCH: break; case Match::PARTIAL_MATCH: return Process::NEED_MORE; case Match::NO_MATCH: delegate_->error(std::format("Expected end of {}", command_name(item.command))); return Process::ERROR; } std::vector attr; auto const consumed = pop_stack(attr); // Now we're back to the real buffer auto data = buffer_->rspan(consumed); std::size_t a = 0; if (a + 4 <= attr.size() && make_string_view(data.subspan(attr[a + 0], attr[a + 1])) == "version") { auto version = make_string_view(data.subspan(attr[a + 2] + 1, attr[a + 3] - 2)); if (!valid_version(version)) { delegate_->error(std::format("Unsupported xmldecl version, {}", version)); return Process::ERROR; } a += 4; } else { // No version delegate_->error("Invalid xmldecl, must have a version attribute first."); return Process::ERROR; } if (a + 4 <= attr.size() && make_string_view(data.subspan(attr[a + 0], attr[a + 1])) == "encoding") { auto encoding = make_string_view(data.subspan(attr[a + 2] + 1, attr[a + 3] - 2)); if (forced_decoder_) { // encoding value is ignored // TODO: Should we check that it is valid anyway? } else { auto decoder = pick_decoder_for_encoding(encoding, nullptr); if (!decoder && decoder_factory_) decoder = decoder_factory_->create(encoding); if (!decoder) { delegate_->error(std::format("Unknown encoding {}", encoding)); return Process::ERROR; } std::swap(decoder_, decoder); // TODO: Re-decode the rest of the buffer? } a += 4; } if (a + 4 <= attr.size() && make_string_view(data.subspan(attr[a + 0], attr[a + 1])) == "standalone") { auto sd = make_string_view(data.subspan(attr[a + 2] + 1, attr[a + 3] - 2)); if (sd == "yes") { // TODO: Handle standalone == yes } else if (sd == "no") { // TODO: Handle standalone == no } else { delegate_->error(std::format( "Invalid xmldecl, standalone attribute has unsupported value, {}", sd)); return Process::ERROR; } a += 4; } if (a < attr.size()) { delegate_->error( std::format("Invalid xmldecl, unknown attribute, {}", make_string_view(data.subspan(attr[a + 0], attr[a + 1])))); return Process::ERROR; } buffer_->consume(consumed); return Process::CONTINUE; } Process process_start_or_empty_tag(CommandItem const& item) { // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' // STag ::= '<' Name (S Attribute)* S? '>' if (item.offset == 0) { switch (match("<")) { case Match::FULL_MATCH: add_to_stack(item, /* offset */ 1); expect_space(Count::ZERO_OR_ONE); expect_attribute(Count::ZERO_OR_MANY); expect_name(Count::ONE); return Process::CONTINUE; case Match::PARTIAL_MATCH: return Process::NEED_MORE; case Match::NO_MATCH: return no_match(item); } } assert(item.offset == 1); bool empty_tag; // Remember that this is still reading for the read view buffer. switch (match_consume("/>")) { case Match::FULL_MATCH: empty_tag = true; break; case Match::PARTIAL_MATCH: return Process::NEED_MORE; case Match::NO_MATCH: switch (match_consume(">")) { case Match::FULL_MATCH: empty_tag = false; break; case Match::PARTIAL_MATCH: return Process::NEED_MORE; case Match::NO_MATCH: delegate_->error(std::format("Expected end of {}", command_name(item.command))); return Process::ERROR; } break; } std::vector attr; auto const consumed = pop_stack(attr); // Now we're back to the real buffer auto data = buffer_->rspan(consumed); assert(attr.size() >= 2); auto name = make_string_view(data.subspan(attr[0], attr[1])); AttributesImpl attributes; if (!attributes.init(entities_, data, std::move(attr), 2)) { delegate_->error("Invalid references in attribute values"); return Process::ERROR; } add_if_more(item); if (empty_tag) { delegate_->empty_element(name, attributes); } else { delegate_->start_element(name, attributes); expect_end_tag(Count::ONE); expect_content(Count::ONE); } buffer_->consume(consumed); return Process::CONTINUE; } Process process_end_tag(CommandItem const& item) { // ETag ::= '' if (item.offset == 0) { switch (match("")) { case Match::FULL_MATCH: break; case Match::PARTIAL_MATCH: return Process::NEED_MORE; case Match::NO_MATCH: delegate_->error(std::format("Expected end of {}", command_name(item.command))); return Process::ERROR; } std::vector attr; auto const consumed = pop_stack(attr); // Now we're back to the real buffer auto data = buffer_->rspan(consumed); assert(attr.size() == 2); auto name = make_string_view(data.subspan(attr[0], attr[1])); add_if_more(item); delegate_->end_element(name); buffer_->consume(consumed); return Process::CONTINUE; } static bool valid_version(std::string_view version) { if (version.size() < 3) return false; if (!version.starts_with("1.")) return false; for (std::size_t i = 2; i < version.size(); ++i) { if (!is_digit(version[i])) return false; } return true; } Process process_element(CommandItem& item) { // TODO delegate_->error("Element is not yet supported"); return Process::ERROR; } Process consume_space(std::size_t& count, uint32_t& last_char) { auto data = buffer_->rspan(4); std::size_t consumed = 0; while (true) { std::size_t offset = consumed; auto c = utf::read8(data, offset); if (c == utf::NEED_MORE) { buffer_->consume(consumed); return Process::NEED_MORE; } if (c == utf::INVALID || !valid_char(c)) return invalid_char(data, offset); if (!is_ws(c)) { last_char = c; buffer_->consume(consumed); return Process::CONTINUE; } ++count; handle_ws(c); consumed = offset; } } Process process_space(CommandItem& item) { // S ::= (#x20 | #x9 | #xD | #xA)+ // item.offset is only used to count spaces. We consume each space as it // is found so no offset in buffer. uint32_t unused; auto ret = consume_space(item.offset, unused); if (ret != Process::CONTINUE) return ret; if (item.offset == 0) return no_match(item); add_if_more(item); return Process::CONTINUE; } void add_if_more(CommandItem const& item) { switch (item.count) { case Count::ONE: break; case Count::ONE_OR_MANY: cmds_.emplace_back(item.command, Count::ZERO_OR_MANY); break; case Count::ZERO_OR_ONE: break; case Count::ZERO_OR_MANY: cmds_.emplace_back(item.command, item.count); } } Match find(std::string_view str, std::size_t& offset) { auto data = buffer_->rspan(offset + str.size()); std::size_t i = 0; while (offset < data.size()) { if (str[i] == data[offset]) { ++i; if (i == str.size()) { offset -= i; return Match::FULL_MATCH; } } else { i = 0; } ++offset; } if (i > 0) { offset -= i; return Match::PARTIAL_MATCH; } return Match::NO_MATCH; } Match match(std::string_view str, std::size_t offset = 0) { auto data = buffer_->rspan(offset + str.size()); if (data.size() <= offset) return Match::PARTIAL_MATCH; auto const avail = std::min(str.size(), data.size() - offset); for (std::size_t i = 0; i < avail; ++i) { if (str[i] != data[offset + i]) return Match::NO_MATCH; } if (avail < str.size()) return Match::PARTIAL_MATCH; return Match::FULL_MATCH; } Match match_consume(std::string_view str) { auto ret = match(str); if (ret == Match::FULL_MATCH) buffer_->consume(str.size()); return ret; } Match match_s() { auto data = buffer_->rspan(4); std::size_t offset = 0; auto c = utf::read8(data, offset); if (c == utf::NEED_MORE) return data.size() == 0 ? Match::PARTIAL_MATCH : Match::NO_MATCH; if (c == utf::INVALID) return Match::NO_MATCH; if (!valid_char(c) || !is_ws(c)) return Match::NO_MATCH; return Match::FULL_MATCH; } Process no_match(CommandItem const& item) { switch (item.count) { case Count::ONE: case Count::ONE_OR_MANY: delegate_->error(std::format("Expected {}", command_name(item.command))); return Process::ERROR; case Count::ZERO_OR_ONE: case Count::ZERO_OR_MANY: break; } return Process::CONTINUE; } void handle_ws(uint32_t c) { if (c == '\n') { ++line_; column_ = 0; } else { ++column_; } } Process invalid_char(std::span data, std::size_t offset) { delegate_->error(std::format("Invalid char {:02x}", data[offset])); return Process::ERROR; } static std::string_view command_name(Command command) { switch (command) { case Command::MISC: return "misc"sv; case Command::FILL_BUFFER: return "more data"sv; case Command::ELEMENT: return "element"sv; case Command::SPACE: return "whitespace"sv; case Command::COMMENT: return "comment"sv; case Command::PROCESSING_INSTRUCTION: return "processing instruction"sv; case Command::XMLDECL: return "xml declaration"sv; case Command::ATTRIBUTE: return "attribute"sv; case Command::ATTRIBUTE_VALUE: return "attribute value"sv; case Command::NAME: return "name"sv; case Command::EQUAL: return "equal sign (=)"sv; case Command::START_OR_EMPTY_TAG: return "element"sv; case Command::END_TAG: return "end tag"sv; } assert(false); return {}; } std::shared_ptr delegate_; std::shared_ptr decoder_factory_; std::unique_ptr decoder_; bool const forced_decoder_; std::unique_ptr buffer_; Entities entities_; std::vector cmds_; std::vector stack_; uint64_t line_{1}; uint64_t column_{0}; }; } // namespace std::unique_ptr create_processor( std::shared_ptr delegate, std::shared_ptr decoder_factory, std::optional force_encoding, std::optional opt_default_buffer_size, std::optional opt_max_buffer_size) { std::unique_ptr decoder; if (force_encoding.has_value()) { decoder = pick_decoder_for_encoding(force_encoding.value(), decoder_factory.get()); } std::size_t default_buffer_size = kDefaultBufferSize; if (opt_default_buffer_size.has_value()) default_buffer_size = std::max(kMinBufferSize, opt_default_buffer_size.value()); // This value is documented in public headers. Do NOT change. std::size_t max_buffer_size = 10 * 1024 * 1024; // No validation for user set value. If it is too small MAX_MEMORY_EXCEEDED // error will be thrown. If it is too large we will get OUT_OF_MEMORY or // crash depending on platform. if (opt_max_buffer_size.has_value()) max_buffer_size = opt_max_buffer_size.value(); return std::make_unique(std::move(delegate), std::move(decoder_factory), std::move(decoder), default_buffer_size, max_buffer_size); } std::unique_ptr Processor::create(std::shared_ptr delegate) { return create_processor(std::move(delegate), nullptr, std::nullopt, std::nullopt, std::nullopt); } } // namespace sax } // namespace modxml