diff options
Diffstat (limited to 'sax/src')
| -rw-r--r-- | sax/src/decoder.cc | 321 | ||||
| -rw-r--r-- | sax/src/decoder.hh | 33 | ||||
| -rw-r--r-- | sax/src/processor.hh | 27 | ||||
| -rw-r--r-- | sax/src/sax_processor.cc | 145 | ||||
| -rw-r--r-- | sax/src/sax_processor_builder.cc | 62 | ||||
| -rw-r--r-- | sax/src/utils.cc | 70 | ||||
| -rw-r--r-- | sax/src/utils.hh | 22 |
7 files changed, 680 insertions, 0 deletions
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc
new file mode 100644
index 0000000..30b1735
--- /dev/null
+++ b/sax/src/decoder.cc
@@ -0,0 +1,345 @@
+#include "decoder.hh"
+
+#include "macros.hh"
+#include "sax_decoder.hh"
+#include "utf16.hh"
+#include "utf32.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string_view>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Inner loop shared by every UTF decoder in this file.
+//
+// Calls |read_one(in, in_offset)| repeatedly and stores each returned code
+// point in |out| until |out_offset| reaches |out_size| or |read_one|
+// returns utf::NEED_MORE / utf::INVALID.
+//
+// |out_start| is the value |out_offset| had when the surrounding decode()
+// call began: if anything was written since then, GOOD is returned in place
+// of NEED_MORE / INVALID.
+//
+// |State| is a template parameter so that the helper does not depend on
+// where the Decoder state enum is declared.
+//
+// NOTE(review): like the hand-written loops it replaces, this stores one
+// code point before comparing |out_offset| with |out_size|, so it assumes
+// out_offset < out_size on entry - confirm against the Decoder contract.
+template <typename State, typename ReadOne>
+State decode_run(ReadOne read_one, std::string_view in,
+                 std::size_t& in_offset, uint32_t* out, std::size_t out_size,
+                 std::size_t& out_offset, std::size_t out_start) {
+  do {
+    uint32_t const ret = read_one(in, in_offset);
+    if (ret == utf::NEED_MORE) {
+      return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+    }
+    if (ret == utf::INVALID) {
+      return out_offset > out_start ? State::GOOD : State::INVALID;
+    }
+    out[out_offset++] = ret;
+  } while (out_offset < out_size);
+  return State::GOOD;
+}
+
+// Base class for decoders with a fixed encoding and an optional BOM.
+// Subclasses supply read(); a U+FEFF at the very start of the input is
+// consumed without being written to |out|.
+class UtfDecoder : public Decoder {
+ public:
+  State decode(std::string_view in, std::size_t& in_offset,
+               uint32_t* out, std::size_t out_size,
+               std::size_t& out_offset) override {
+    std::size_t const out_start = out_offset;
+    if (bom_ == -1) UNLIKELY {
+      // Start of input not examined yet. Work on a copy of the offset so
+      // that |in_offset| only moves once this whole step has succeeded.
+      std::size_t tmp = in_offset;
+      uint32_t ret = read(in, tmp);
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return State::INVALID;
+      }
+      if (ret == 0xfeff) {
+        // To allow offset to advance and to return, we need to
+        // read at least one more character completely.
+        ret = read(in, tmp);
+        if (ret == utf::NEED_MORE) {
+          return State::NEED_MORE;
+        }
+        if (ret == utf::INVALID) {
+          return State::INVALID;
+        }
+        bom_ = 1;
+      } else {
+        bom_ = 0;
+      }
+      in_offset = tmp;
+      out[out_offset++] = ret;
+      if (out_offset == out_size)
+        return State::GOOD;
+    }
+
+    return decode_run<State>(
+        [this](std::string_view data, std::size_t& offset) {
+          return read(data, offset);
+        },
+        in, in_offset, out, out_size, out_offset, out_start);
+  }
+
+ protected:
+  UtfDecoder() = default;
+
+  // Reads one code point from |data| at |offset|. decode() treats the
+  // return values utf::NEED_MORE and utf::INVALID as "nothing decoded".
+  virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;
+
+ private:
+  // -1: start of input not examined yet, 0: no BOM, 1: BOM was skipped.
+  int8_t bom_{-1};
+};
+
+class Utf8Decoder : public UtfDecoder {
+ public:
+  Utf8Decoder() = default;
+
+  uint32_t read(std::string_view data, std::size_t& offset) const override {
+    return utf::read8(data, offset);
+  }
+};
+
+class Utf16BeDecoder : public UtfDecoder {
+ public:
+  Utf16BeDecoder() = default;
+
+  uint32_t read(std::string_view data, std::size_t& offset) const override {
+    return utf::read16be(data, offset);
+  }
+};
+
+class Utf16LeDecoder : public UtfDecoder {
+ public:
+  Utf16LeDecoder() = default;
+
+  uint32_t read(std::string_view data, std::size_t& offset) const override {
+    return utf::read16le(data, offset);
+  }
+};
+
+class Utf32BeDecoder : public UtfDecoder {
+ public:
+  Utf32BeDecoder() = default;
+
+  uint32_t read(std::string_view data, std::size_t& offset) const override {
+    return utf::read32be(data, offset);
+  }
+};
+
+class Utf32LeDecoder : public UtfDecoder {
+ public:
+  Utf32LeDecoder() = default;
+
+  uint32_t read(std::string_view data, std::size_t& offset) const override {
+    return utf::read32le(data, offset);
+  }
+};
+
+// UTF-16 decoder that requires a BOM and takes the byte order from it.
+class Utf16Decoder : public Decoder {
+ public:
+  Utf16Decoder() = default;
+
+  State decode(std::string_view in, std::size_t& in_offset,
+               uint32_t* out, std::size_t out_size,
+               std::size_t& out_offset) override {
+    std::size_t const out_start = out_offset;
+    if (endian_ == -1) UNLIKELY {
+      std::size_t tmp = in_offset;
+      uint32_t ret = utf::read16be(in, tmp);
+      int8_t endian;
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return State::INVALID;
+      }
+      if (ret == 0xfeff) {
+        endian = 1; // Big endian
+      } else if (ret == 0xfffe) {
+        endian = 0; // Little endian
+      } else {
+        return State::INVALID;
+      }
+
+      // To allow offset to advance and to return, we need to
+      // read at least one more character completely.
+      ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return State::INVALID;
+      }
+
+      endian_ = endian;
+      in_offset = tmp;
+      out[out_offset++] = ret;
+      if (out_offset == out_size)
+        return State::GOOD;
+    }
+
+    if (endian_ == 1) {
+      return decode_run<State>(
+          [](std::string_view data, std::size_t& offset) {
+            return utf::read16be(data, offset);
+          },
+          in, in_offset, out, out_size, out_offset, out_start);
+    }
+    return decode_run<State>(
+        [](std::string_view data, std::size_t& offset) {
+          return utf::read16le(data, offset);
+        },
+        in, in_offset, out, out_size, out_offset, out_start);
+  }
+
+ private:
+  // -1: BOM not read yet, 0: little endian, 1: big endian.
+  int8_t endian_{-1};
+};
+
+// UTF-32 decoder that requires a BOM and takes the byte order from it.
+class Utf32Decoder : public Decoder {
+ public:
+  Utf32Decoder() = default;
+
+  State decode(std::string_view in, std::size_t& in_offset,
+               uint32_t* out, std::size_t out_size,
+               std::size_t& out_offset) override {
+    std::size_t const out_start = out_offset;
+    if (endian_ == -1) UNLIKELY {
+      std::size_t tmp = in_offset;
+      uint32_t ret = utf::read32be(in, tmp);
+      int8_t endian;
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        // Not decodable as big endian: retry from the same position as
+        // little endian and accept nothing but a BOM.
+        tmp = in_offset;
+        ret = utf::read32le(in, tmp);
+        if (ret == 0xfeff) {
+          endian = 0; // Little endian
+        } else {
+          return State::INVALID;
+        }
+      } else if (ret == 0xfeff) {
+        endian = 1; // Big endian
+      } else {
+        return State::INVALID;
+      }
+
+      // To allow offset to advance and to return, we need to
+      // read the next character completely.
+      ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return State::INVALID;
+      }
+
+      endian_ = endian;
+      in_offset = tmp;
+      out[out_offset++] = ret;
+      if (out_offset == out_size)
+        return State::GOOD;
+    }
+
+    if (endian_ == 1) {
+      return decode_run<State>(
+          [](std::string_view data, std::size_t& offset) {
+            return utf::read32be(data, offset);
+          },
+          in, in_offset, out, out_size, out_offset, out_start);
+    }
+    return decode_run<State>(
+        [](std::string_view data, std::size_t& offset) {
+          return utf::read32le(data, offset);
+        },
+        in, in_offset, out, out_size, out_offset, out_start);
+  }
+
+ private:
+  // -1: BOM not read yet, 0: little endian, 1: big endian.
+  int8_t endian_{-1};
+};
+
+// US-ASCII decoder: bytes below 0x80 are passed through unchanged and any
+// byte with the high bit set is treated as invalid.
+class AsciiDecoder : public Decoder {
+ public:
+  AsciiDecoder() = default;
+
+  State decode(std::string_view in, std::size_t& in_offset,
+               uint32_t* out, std::size_t out_size,
+               std::size_t& out_offset) override {
+    std::size_t const out_start = out_offset;
+    do {
+      if (in_offset == in.size())
+        return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+      if (in[in_offset] & 0x80)
+        return out_offset > out_start ? State::GOOD : State::INVALID;
+      out[out_offset++] = in[in_offset++];
+    } while (out_offset < out_size);
+    return State::GOOD;
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Decoder> create_utf8_decoder() {
+  return std::make_unique<Utf8Decoder>();
+}
+
+std::unique_ptr<Decoder> create_utf16be_decoder() {
+  return std::make_unique<Utf16BeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf16le_decoder() {
+  return std::make_unique<Utf16LeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf32be_decoder() {
+  return std::make_unique<Utf32BeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf32le_decoder() {
+  return std::make_unique<Utf32LeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf16_decoder() {
+  return std::make_unique<Utf16Decoder>();
+}
+
+std::unique_ptr<Decoder> create_utf32_decoder() {
+  return std::make_unique<Utf32Decoder>();
+}
+
+std::unique_ptr<Decoder> create_ascii_decoder() {
+  return std::make_unique<AsciiDecoder>();
+}
+
+} // namespace sax
+} // namespace modxml
+
diff --git a/sax/src/decoder.hh b/sax/src/decoder.hh
new file mode 100644
index 0000000..bd2a99a
--- /dev/null
+++ b/sax/src/decoder.hh
@@ -0,0 +1,33 @@
+#ifndef DECODER_HH
+#define DECODER_HH
+
+#include "macros.hh"
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+// UTF-8 with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf8_decoder();
+// UTF-16 with BOM
+std::unique_ptr<Decoder> HIDDEN create_utf16_decoder();
+// UTF-16BE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf16be_decoder();
+// UTF-16LE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf16le_decoder();
+// UTF-32 with BOM
+std::unique_ptr<Decoder> HIDDEN create_utf32_decoder();
+// UTF-32BE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf32be_decoder();
+// UTF-32LE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf32le_decoder();
+// US-ASCII
+std::unique_ptr<Decoder> HIDDEN create_ascii_decoder();
+
+} // namespace sax
+} // 
namespace modxml
+
+#endif // DECODER_HH
diff --git a/sax/src/processor.hh b/sax/src/processor.hh
new file mode 100644
index 0000000..4a2de29
--- /dev/null
+++ b/sax/src/processor.hh
@@ -0,0 +1,32 @@
+#ifndef PROCESSOR_HH
+#define PROCESSOR_HH
+
+#include "macros.hh"
+
+#include <cstddef>
+#include <memory>
+#include <optional>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class DecoderFactory;
+class Delegate;
+class Processor;
+
+// Internal factory behind create() and ProcessorBuilder::build().
+// |force_encoding|: when set, the decoder is chosen up front from this name.
+// |default_buffer_size|: 8192 when unset; values below 128 are raised to 128.
+// |max_buffer_size|: 10 MiB when unset; a given value is used unchecked.
+std::unique_ptr<Processor> HIDDEN create_processor(
+    std::shared_ptr<Delegate> delegate,
+    std::shared_ptr<DecoderFactory> decoder_factory,
+    std::optional<std::string> force_encoding,
+    std::optional<std::size_t> default_buffer_size,
+    std::optional<std::size_t> max_buffer_size);
+
+} // namespace sax
+} // namespace modxml
+
+#endif // PROCESSOR_HH
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
new file mode 100644
index 0000000..ea9f753
--- /dev/null
+++ b/sax/src/sax_processor.cc
@@ -0,0 +1,162 @@
+#include "sax_processor.hh"
+
+#include "sax_decoder.hh"
+#include "processor.hh"
+#include "utils.hh"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// 2.2 Characters
+// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+
+// Returns true if |c| is an XML Char. Surrogates and values above U+10FFFF
+// are not rejected here; the input is expected to be valid Unicode already.
+inline bool valid_char(uint32_t c) {
+  // Assume valid unicode (U+0 - U+10ffff except surrogate blocks)
+  if (c >= 0x20 && c <= 0xfffd)
+    return true;
+  if (c == 0x9 || c == 0xa || c == 0xd)
+    return true;
+  return c >= 0x10000;
+}
+
+// 2.3 Common Syntactic Constructs
+// [3] S ::= (#x20 | #x9 | #xD | #xA)+
+
+// Returns true if |c| is XML white space, given that it passed valid_char().
+inline bool is_ws(uint32_t c) {
+  // Assume we already checked for valid_char.
+  return c <= 0x20;
+}
+
+// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+
+// Returns true if |c| may start an XML Name (production [4]).
+inline bool is_namestartchar(uint32_t c) {
+  if (c < 0x41 /* A */)
+    return c == 0x3a /* : */;
+  if (c <= 0x5a /* Z */)
+    return true;
+  if (c < 0x61 /* a */)
+    return c == 0x5f /* _ */;
+  if (c <= 0x7a /* z */)
+    return true;
+  if (c < 0xc0)
+    return false;
+  if (c < 0x300)
+    return c != 0xd7 && c != 0xf7;
+  // [#x300-#x36F] is NameChar only, and #x37E sits between two ranges.
+  if (c < 0x370 || c == 0x37e)
+    return false;
+  if (c > 0x1fff && c < 0x200c)
+    return false;
+  if (c > 0x200d && c < 0x2070)
+    return false;
+  if (c > 0x218f && c < 0x2c00)
+    return false;
+  if (c > 0x2fef && c < 0x3001)
+    return false;
+  // valid_char() lets the surrogate block through, so reject it here
+  // together with the rest of the gap up to #xF900.
+  if (c > 0xd7ff && c < 0xf900)
+    return false;
+  if (c > 0xfdcf && c < 0xfdf0)
+    return false;
+  if (c > 0xfffd && c < 0x10000)
+    return false;
+  // The last range is [#x10000-#xEFFFF]; anything above is not a start char.
+  return c <= 0xeffff;
+}
+
+// Returns true if |c| may appear inside an XML Name (production [4a]).
+inline bool is_namechar(uint32_t c) {
+  return is_namestartchar(c) || (c >= 0x2d /* - */ && c <= 0x2e /* . */) ||
+      (c >= 0x30 /* 0 */ && c <= 0x39 /* 9 */) || (c == 0xb7) ||
+      (c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040);
+}
+
+/* [5] Name ::= NameStartChar (NameChar)*
+[6] Names ::= Name (#x20 Name)*
+[7] Nmtoken ::= (NameChar)+
+[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+*/
+
+// Holds the configuration passed to create_processor().
+class ProcessorImpl : public Processor {
+ public:
+  ProcessorImpl(std::shared_ptr<Delegate> delegate,
+                std::shared_ptr<DecoderFactory> decoder_factory,
+                std::unique_ptr<Decoder> decoder,
+                std::size_t default_buffer_size,
+                std::size_t max_buffer_size)
+      : delegate_(std::move(delegate)),
+        decoder_factory_(std::move(decoder_factory)),
+        decoder_(std::move(decoder)),
+        default_buffer_size_(default_buffer_size),
+        max_buffer_size_(max_buffer_size) {}
+
+ private:
+  std::shared_ptr<Delegate> delegate_;
+  std::shared_ptr<DecoderFactory> decoder_factory_;
+  std::unique_ptr<Decoder> decoder_;
+  std::size_t default_buffer_size_;
+  std::size_t max_buffer_size_;
+};
+
+} // namespace
+
+// NOTE(review): if pick_decoder_for_encoding() returns null for a forced
+// encoding, the processor is created without a decoder and nothing is
+// reported here - confirm that this case is handled later.
+std::unique_ptr<Processor> create_processor(
+    std::shared_ptr<Delegate> delegate,
+    std::shared_ptr<DecoderFactory> decoder_factory,
+    std::optional<std::string> force_encoding,
+    std::optional<std::size_t> opt_default_buffer_size,
+    std::optional<std::size_t> opt_max_buffer_size) {
+
+  std::unique_ptr<Decoder> decoder;
+  if (force_encoding.has_value()) {
+    decoder = pick_decoder_for_encoding(force_encoding.value(),
+                                        decoder_factory.get());
+  }
+
+  std::size_t default_buffer_size = 8192;
+  if (opt_default_buffer_size.has_value())
+    default_buffer_size = std::max(static_cast<std::size_t>(128),
+                                   opt_default_buffer_size.value());
+  // This value is documented in public headers. Do NOT change.
+  std::size_t max_buffer_size = 10 * 1024 * 1024;
+  // No validation for user set value. If it is too small MAX_MEMORY_EXCEEDED
+  // error will be thrown. If it is too large we will get OUT_OF_MEMORY or
+  // crash depending on platform.
+  if (opt_max_buffer_size.has_value())
+    max_buffer_size = opt_max_buffer_size.value();
+
+  return std::make_unique<ProcessorImpl>(std::move(delegate),
+                                         std::move(decoder_factory),
+                                         std::move(decoder),
+                                         default_buffer_size,
+                                         max_buffer_size);
+}
+
+// Creates a processor with default settings and no custom decoder factory.
+std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) {
+  return create_processor(std::move(delegate), nullptr,
+                          std::nullopt, std::nullopt, std::nullopt);
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/sax_processor_builder.cc b/sax/src/sax_processor_builder.cc
new file mode 100644
index 0000000..8817099
--- /dev/null
+++ b/sax/src/sax_processor_builder.cc
@@ -0,0 +1,67 @@
+#include "sax_processor_builder.hh"
+
+#include "processor.hh"
+#include "sax_processor.hh"
+
+#include <cstddef>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Collects optional settings and forwards them to create_processor().
+// Settings that were never set are passed on as empty optionals / null.
+class ProcessorBuilderImpl : public ProcessorBuilder {
+ public:
+  ProcessorBuilder* force_encoding(std::string const& str) override {
+    force_encoding_ = str;
+    return this;
+  }
+
+  ProcessorBuilder* custom_decoder_factory(
+      std::shared_ptr<DecoderFactory> custom_decoder_factory) override {
+    decoder_factory_ = std::move(custom_decoder_factory);
+    return this;
+  }
+
+  ProcessorBuilder* set_default_buffer_size(std::size_t size) override {
+    default_buffer_size_ = size;
+    return this;
+  }
+
+  ProcessorBuilder* set_max_buffer_size(std::size_t size) override {
+    max_buffer_size_ = size;
+    return this;
+  }
+
+  std::unique_ptr<Processor> build(
+      std::shared_ptr<Delegate> delegate) const override {
+    return create_processor(std::move(delegate),
+                            decoder_factory_,
+                            force_encoding_,
+                            default_buffer_size_,
+                            max_buffer_size_);
+  }
+
+  ProcessorBuilderImpl() = default;
+
+ private:
+  std::shared_ptr<DecoderFactory> decoder_factory_;
+  std::optional<std::string> force_encoding_;
+  std::optional<std::size_t> default_buffer_size_;
+  std::optional<std::size_t> max_buffer_size_;
+};
+
+} // namespace
+
+std::unique_ptr<ProcessorBuilder> ProcessorBuilder::create() {
+  return std::make_unique<ProcessorBuilderImpl>();
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/utils.cc b/sax/src/utils.cc
new file mode 100644
index 0000000..f0366d5
--- /dev/null
+++ b/sax/src/utils.cc
@@ -0,0 +1,88 @@
+#include "utils.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Normalises an encoding label for comparison: ASCII letters are lowered,
+// digits are kept, '.', '_' and '-' all become '-', and every other
+// character is dropped.
+std::string cleanup_encoding(std::string const& str) {
+  std::string cleaned;
+  cleaned.reserve(str.size());
+  for (char const ch : str) {
+    bool const is_upper = ch >= 'A' && ch <= 'Z';
+    bool const is_lower = ch >= 'a' && ch <= 'z';
+    bool const is_digit = ch >= '0' && ch <= '9';
+    if (is_upper) {
+      cleaned += static_cast<char>(ch | 0x20);
+    } else if (is_lower || is_digit) {
+      cleaned += ch;
+    } else if (ch == '.' || ch == '_' || ch == '-') {
+      cleaned += '-';
+    }
+  }
+  return cleaned;
+}
+
+// One accepted (already cleaned) label and the decoder it selects.
+struct BuiltinEncoding {
+  char const* label;
+  std::unique_ptr<Decoder> (*make)();
+};
+
+// Names inspired by:
+// https://www.iana.org/assignments/character-sets/character-sets.xhtml
+constexpr BuiltinEncoding kBuiltinEncodings[] = {
+    {"utf-8", create_utf8_decoder},
+    {"utf8", create_utf8_decoder},
+    {"utf-16", create_utf16_decoder},
+    {"utf16", create_utf16_decoder},
+    {"utf-16be", create_utf16be_decoder},
+    {"utf16be", create_utf16be_decoder},
+    {"utf-16le", create_utf16le_decoder},
+    {"utf16le", create_utf16le_decoder},
+    {"utf-32", create_utf32_decoder},
+    {"utf32", create_utf32_decoder},
+    {"utf-32be", create_utf32be_decoder},
+    {"utf32be", create_utf32be_decoder},
+    {"utf-32le", create_utf32le_decoder},
+    {"utf32le", create_utf32le_decoder},
+    {"ascii", create_ascii_decoder},
+    {"us-ascii", create_ascii_decoder},
+    {"usascii", create_ascii_decoder},
+    {"iso-ir-6", create_ascii_decoder},
+    {"ansi-x3-4-1968", create_ascii_decoder},
+    {"ansi-x3-4-1986", create_ascii_decoder},
+    {"iso-646-irv1991", create_ascii_decoder},
+    {"iso646-us", create_ascii_decoder},
+    {"us", create_ascii_decoder},
+    {"ibm367", create_ascii_decoder},
+    {"cp367", create_ascii_decoder},
+};
+
+} // namespace
+
+// Returns a built-in decoder when the cleaned |encoding| matches a known
+// label; otherwise asks |factory| (if any) with the original, uncleaned
+// |encoding|. Returns null when there is no match and no factory.
+std::unique_ptr<Decoder> pick_decoder_for_encoding(
+    std::string const& encoding, DecoderFactory* factory) {
+  std::string const wanted = cleanup_encoding(encoding);
+  for (BuiltinEncoding const& builtin : kBuiltinEncodings) {
+    if (wanted == builtin.label)
+      return builtin.make();
+  }
+  if (!factory)
+    return nullptr;
+  return factory->create(encoding);
+}
+
+} // namespace sax
+
+} // namespace modxml
diff --git a/sax/src/utils.hh b/sax/src/utils.hh
new file mode 100644
index 0000000..206d003
--- /dev/null
+++ b/sax/src/utils.hh
@@ -0,0 +1,24 @@
+#ifndef UTILS_HH
+#define UTILS_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+class DecoderFactory;
+
+// Picks a decoder for the encoding label |encoding|. Built-in encodings are
+// tried first; |factory| may be null, in which case only those are known.
+std::unique_ptr<Decoder> HIDDEN pick_decoder_for_encoding(
+    std::string const& encoding,
+    DecoderFactory* factory);
+
+} // namespace sax
+} // namespace modxml
+
+#endif // UTILS_HH
