#include "sax_processor.hh"

#include "sax_decoder.hh"
#include "processor.hh"
#include "utils.hh"

// NOTE(review): the targets of the three includes below, and the template
// arguments further down (std::shared_ptr, std::unique_ptr, std::optional,
// static_cast, std::make_unique), are blank in this copy of the file. They
// look like they were lost in extraction - confirm against the original
// before building.
#include
#include
#include

namespace modxml {
namespace sax {

namespace {

// 2.2 Characters
// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
//            | [#x10000-#x10FFFF]

// Returns true if |c| matches the Char production.
//
// Precondition: |c| is a valid Unicode scalar value (U+0 - U+10ffff except
// the surrogate blocks). Because of that, neither the surrogate range nor the
// upper limit is re-checked here.
inline bool valid_char(uint32_t c) {
  if (c >= 0x20 && c <= 0xfffd)
    return true;
  if (c == 0x9 || c == 0xa || c == 0xd)
    return true;
  // Only U+FFFE, U+FFFF and the remaining C0 controls reach this point with a
  // value below 0x10000; they are not valid.
  return c >= 0x10000;
}

// 2.3 Common Syntactic Constructs
// [3] S ::= (#x20 | #x9 | #xD | #xA)+

// Returns true if |c| is XML whitespace.
//
// Precondition: |c| already passed valid_char(), so the only values <= 0x20
// that can show up are #x9, #xA, #xD and #x20.
inline bool is_ws(uint32_t c) {
  return c <= 0x20;
}

// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6]
//                     | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
//                     | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
//                     | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
//                     | [#x10000-#xEFFFF]
// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7
//                 | [#x0300-#x036F] | [#x203F-#x2040]

// Returns true if |c| matches the NameStartChar production.
//
// The checks walk the code space in ascending order: the ASCII part is
// decided directly, after that every gap between two allowed ranges is
// rejected, and whatever is left up to #xEFFFF is accepted.
inline bool is_namestartchar(uint32_t c) {
  if (c < 0x41 /* A */)
    return c == 0x3a /* : */;
  if (c <= 0x5a /* Z */)
    return true;
  if (c < 0x61 /* a */)
    return c == 0x5f /* _ */;
  if (c <= 0x7a /* z */)
    return true;
  if (c < 0xc0)
    return false;
  if (c < 0x300)
    return c != 0xd7 && c != 0xf7;
  // [#x300-#x36F] is only a NameChar (see [4a]), never a NameStartChar.
  if (c < 0x370)
    return false;
  if (c > 0x37d && c < 0x37f)
    return false;
  if (c > 0x1fff && c < 0x200c)
    return false;
  if (c > 0x200d && c < 0x2070)
    return false;
  if (c > 0x218f && c < 0x2c00)
    return false;
  if (c > 0x2fef && c < 0x3001)
    return false;
  // Gap between #xD7FF and #xF900. Starting at #xD7FF (as the production
  // does) also rejects surrogates without relying on a prior valid_char().
  if (c > 0xd7ff && c < 0xf900)
    return false;
  if (c > 0xfdcf && c < 0xfdf0)
    return false;
  if (c > 0xfffd && c < 0x10000)
    return false;
  // The last allowed range ends at #xEFFFF; planes 15 and 16 are excluded.
  return c <= 0xeffff;
}

// Returns true if |c| matches the NameChar production: any NameStartChar plus
// "-", ".", the ASCII digits, #xB7 and two extra ranges.
inline bool is_namechar(uint32_t c) {
  return is_namestartchar(c) ||
      (c >= 0x2d /* - */ && c <= 0x2e /* . */) ||
      (c >= 0x30 /* 0 */ && c <= 0x39 /* 9 */) ||
      (c == 0xb7) ||
      (c >= 0x300 && c <= 0x36f) ||
      (c >= 0x203f && c <= 0x2040);
}

/*
[5] Name ::= NameStartChar (NameChar)*
[6] Names ::= Name (#x20 Name)*
[7] Nmtoken ::= (NameChar)+
[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
*/

// Processor implementation. As visible here it only takes ownership of its
// collaborators and remembers the two buffer sizes; it declares no other
// members.
class ProcessorImpl : public Processor {
 public:
  // All pointer arguments are moved into the corresponding members.
  // |decoder| is left empty by create_processor() when no encoding is forced.
  ProcessorImpl(std::shared_ptr delegate,
                std::shared_ptr decoder_factory,
                std::unique_ptr decoder,
                std::size_t default_buffer_size,
                std::size_t max_buffer_size)
      : delegate_(std::move(delegate)),
        decoder_factory_(std::move(decoder_factory)),
        decoder_(std::move(decoder)),
        default_buffer_size_(default_buffer_size),
        max_buffer_size_(max_buffer_size) {}

 private:
  std::shared_ptr delegate_;
  std::shared_ptr decoder_factory_;
  std::unique_ptr decoder_;
  std::size_t default_buffer_size_;
  std::size_t max_buffer_size_;
};

}  // namespace

// Creates a processor.
//
// |force_encoding|: when set, a decoder for that encoding is picked up front
//   with pick_decoder_for_encoding(); otherwise the processor starts without
//   a decoder.
// |opt_default_buffer_size|: defaults to 8192; user values below 128 are
//   raised to 128.
// |opt_max_buffer_size|: defaults to 10 MiB; user values are taken as is.
std::unique_ptr create_processor(
    std::shared_ptr delegate,
    std::shared_ptr decoder_factory,
    std::optional force_encoding,
    std::optional opt_default_buffer_size,
    std::optional opt_max_buffer_size) {
  std::unique_ptr decoder;
  if (force_encoding.has_value()) {
    decoder = pick_decoder_for_encoding(force_encoding.value(),
                                        decoder_factory.get());
  }

  std::size_t default_buffer_size = 8192;
  if (opt_default_buffer_size.has_value())
    default_buffer_size = std::max(static_cast(128),
                                   opt_default_buffer_size.value());

  // This value is documented in public headers. Do NOT change.
  std::size_t max_buffer_size = 10 * 1024 * 1024;
  // No validation for user set value. If it is too small MAX_MEMORY_EXCEEDED
  // error will be thrown. If it is too large we will get OUT_OF_MEMORY or
  // crash depending on platform.
  if (opt_max_buffer_size.has_value())
    max_buffer_size = opt_max_buffer_size.value();

  return std::make_unique(std::move(delegate),
                          std::move(decoder_factory),
                          std::move(decoder),
                          default_buffer_size,
                          max_buffer_size);
}

// Convenience wrapper around create_processor(): no decoder factory, no
// forced encoding and the default buffer sizes.
std::unique_ptr create(std::shared_ptr delegate) {
  return create_processor(std::move(delegate), nullptr, std::nullopt,
                          std::nullopt, std::nullopt);
}

}  // namespace sax
}  // namespace modxml