diff options
Diffstat (limited to 'sax/src/sax_processor.cc')
| -rw-r--r-- | sax/src/sax_processor.cc | 145 |
1 files changed, 145 insertions, 0 deletions
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc new file mode 100644 index 0000000..ea9f753 --- /dev/null +++ b/sax/src/sax_processor.cc @@ -0,0 +1,145 @@ +#include "sax_processor.hh" + +#include "sax_decoder.hh" +#include "processor.hh" +#include "utils.hh" + +#include <algorithm> +#include <optional> +#include <utility> + +namespace modxml { +namespace sax { + +namespace { + +// 2.2 Characters +// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + +inline bool valid_char(uint32_t c) { + // Assume valid unicode (U+0 - U+10ffff except surrogate blocks) + if (c >= 0x20 && c <= 0xfffd) + return true; + if (c == 0x9 || c == 0xa || c == 0xd) + return true; + return c >= 0x10000; +} + +// 2.3 Common Syntactic Constructs +// [3] S ::= (#x20 | #x9 | #xD | #xA)+ + +inline bool is_ws(uint32_t c) { + // Assume we already checked for valid_char. + return c <= 0x20; +} + +// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] +// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] + +inline bool is_namestartchar(uint32_t c) { + if (c < 0x41 /* A */) + return c == 0x3a /* : */; + if (c <= 0x5a /* Z */) + return true; + if (c < 0x61 /* a */) + return c == 0x5f /* _ */; + if (c <= 0x7a /* z */) + return true; + if (c < 0xc0) + return false; + if (c < 0x300) + return c != 0xd7 && c != 0xf7; + if (c > 0x37d && c < 0x37f) + return false; + if (c > 0x1fff && c < 0x200c) + return false; + if (c > 0x200d && c < 0x2070) + return false; + if (c > 0x218f && c < 0x2c00) + return false; + if (c > 0x2fef && c < 0x3001) + return false; + // Already valid_char so don't check for surrogate pair here. + if (c > 0xdfff && c < 0xf900) + return false; + if (c > 0xfdcf && c < 0xfdf0) + return false; + if (c > 0xfffd && c < 0x10000) + return false; + return true; +} + +inline bool is_namechar(uint32_t c) { + return is_namestartchar(c) || (c >= 0x2d /* - */ && c <= 0x2e /* . */) || + (c >= 0x30 /* 0 */ && c <= 0x39 /* 9 */) || (c == 0xb7) || + (c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040); +} + +/* [5] Name ::= NameStartChar (NameChar)* +[6] Names ::= Name (#x20 Name)* +[7] Nmtoken ::= (NameChar)+ +[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* +*/ + +class ProcessorImpl : public Processor { + public: + ProcessorImpl(std::shared_ptr<Delegate> delegate, + std::shared_ptr<DecoderFactory> decoder_factory, + std::unique_ptr<Decoder> decoder, + std::size_t default_buffer_size, + std::size_t max_buffer_size) + : delegate_(std::move(delegate)), + decoder_factory_(std::move(decoder_factory)), + decoder_(std::move(decoder)), + default_buffer_size_(default_buffer_size), + max_buffer_size_(max_buffer_size) {} + + private: + std::shared_ptr<Delegate> delegate_; + std::shared_ptr<DecoderFactory> decoder_factory_; + std::unique_ptr<Decoder> decoder_; + std::size_t default_buffer_size_; + std::size_t max_buffer_size_; +}; + +} // namespace + +std::unique_ptr<Processor> create_processor( + std::shared_ptr<Delegate> delegate, + std::shared_ptr<DecoderFactory> decoder_factory, + std::optional<std::string> force_encoding, + std::optional<std::size_t> opt_default_buffer_size, + std::optional<std::size_t> opt_max_buffer_size) { + + std::unique_ptr<Decoder> decoder; + if (force_encoding.has_value()) { + decoder = pick_decoder_for_encoding(force_encoding.value(), + decoder_factory.get()); + } + + std::size_t default_buffer_size = 8192; + if (opt_default_buffer_size.has_value()) + default_buffer_size = std::max(static_cast<std::size_t>(128), + opt_default_buffer_size.value()); + // This value is documented in public headers. Do NOT change. + std::size_t max_buffer_size = 10 * 1024 * 1024; + // No validation for user set value. If it is too small MAX_MEMORY_EXCEEDED + // error will be thrown. If it is too large we will get OUT_OF_MEMORY or + // crash depending on platform. + if (opt_max_buffer_size.has_value()) + max_buffer_size = opt_max_buffer_size.value(); + + return std::make_unique<ProcessorImpl>(std::move(delegate), + std::move(decoder_factory), + std::move(decoder), + default_buffer_size, + max_buffer_size); +} + +std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) { + return create_processor(std::move(delegate), nullptr, + std::nullopt, std::nullopt, std::nullopt); +} + +} // namespace sax +} // namespace modxml |
