summary | refs | log | tree | commit | diff
path: root/sax/src
diff options
context:
space:
mode:
Diffstat (limited to 'sax/src')
-rw-r--r-- sax/src/decoder.cc 321
-rw-r--r-- sax/src/decoder.hh 33
-rw-r--r-- sax/src/processor.hh 27
-rw-r--r-- sax/src/sax_processor.cc 145
-rw-r--r-- sax/src/sax_processor_builder.cc 62
-rw-r--r-- sax/src/utils.cc 70
-rw-r--r-- sax/src/utils.hh 22
7 files changed, 680 insertions, 0 deletions
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc
new file mode 100644
index 0000000..30b1735
--- /dev/null
+++ b/sax/src/decoder.cc
@@ -0,0 +1,321 @@
+#include "decoder.hh"
+
+#include "macros.hh"
+#include "sax_decoder.hh"
+#include "utf16.hh"
+#include "utf32.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+class UtfDecoder : public Decoder {  // Shared decode loop for decoders with a fixed encoding; subclasses supply read().
+ public:
+ State decode(std::string_view in, std::size_t& in_offset,  // Decodes from in[in_offset...] into out[out_offset...], advancing both offsets.
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;  // Lets later returns tell whether this call wrote anything.
+ if (bom_ == -1) UNLIKELY {  // First successful pass only: look for an optional leading BOM.
+ std::size_t tmp = in_offset;  // Scratch offset; in_offset is committed only once a character is produced.
+ uint32_t ret = read(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+ if (ret == 0xfeff) {  // Leading U+FEFF is a BOM: skip it instead of emitting it.
+ // To allow offset to advance and to return, we need to
+ // read at least one more character completely.
+ ret = read(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;  // bom_ and in_offset are untouched, so the BOM is re-read on the next call.
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+ bom_ = 1;  // BOM seen and skipped.
+ } else {
+ bom_ = 0;  // No BOM; ret is ordinary content.
+ }
+ in_offset = tmp;
+ out[out_offset++] = ret;  // NOTE(review): unconditional write; appears to assume out_offset < out_size on entry -- confirm with callers.
+ if (out_offset == out_size)
+ return State::GOOD;
+ }
+ // Main loop: decode until out is full or read() reports NEED_MORE/INVALID.
+ do {
+ uint32_t ret = read(in, in_offset);  // NOTE(review): presumably read() leaves the offset unchanged on NEED_MORE/INVALID -- confirm in the utf headers.
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;  // Partial progress is reported as GOOD.
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;  // INVALID is only reported when nothing was written in this call.
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ return State::GOOD;
+ }
+
+ protected:
+ UtfDecoder() = default;
+ // Reads one character from data at offset; decode() treats utf::NEED_MORE and utf::INVALID as sentinels.
+ virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;
+
+ private:
+ int8_t bom_{-1};  // -1: not checked yet, 0: input has no BOM, 1: BOM was present and skipped.
+};
+
+class Utf8Decoder : public UtfDecoder {  // UtfDecoder that reads characters with utf::read8().
+ public:
+ Utf8Decoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {  // Forwards to utf::read8(); the result is returned unchanged.
+ return utf::read8(data, offset);
+ }
+};
+
+class Utf16BeDecoder : public UtfDecoder {  // UtfDecoder that reads characters with utf::read16be().
+ public:
+ Utf16BeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {  // Forwards to utf::read16be(); the result is returned unchanged.
+ return utf::read16be(data, offset);
+ }
+};
+
+class Utf16LeDecoder : public UtfDecoder {  // UtfDecoder that reads characters with utf::read16le().
+ public:
+ Utf16LeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {  // Forwards to utf::read16le(); the result is returned unchanged.
+ return utf::read16le(data, offset);
+ }
+};
+
+class Utf32BeDecoder : public UtfDecoder {  // UtfDecoder that reads characters with utf::read32be().
+ public:
+ Utf32BeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {  // Forwards to utf::read32be(); the result is returned unchanged.
+ return utf::read32be(data, offset);
+ }
+};
+
+class Utf32LeDecoder : public UtfDecoder {  // UtfDecoder that reads characters with utf::read32le().
+ public:
+ Utf32LeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {  // Forwards to utf::read32le(); the result is returned unchanged.
+ return utf::read32le(data, offset);
+ }
+};
+
+class Utf16Decoder : public Decoder {  // UTF-16 decoder that requires a BOM and takes its byte order from it.
+ public:
+ Utf16Decoder() = default;
+
+ State decode(std::string_view in, std::size_t& in_offset,  // Decodes from in[in_offset...] into out[out_offset...], advancing both offsets.
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;  // Lets later returns tell whether this call wrote anything.
+ if (endian_ == -1) UNLIKELY {  // Byte order not determined yet.
+ std::size_t tmp = in_offset;  // Scratch offset; in_offset is committed only on success.
+ uint32_t ret = utf::read16be(in, tmp);  // Read the expected BOM as if it were big endian.
+ int8_t endian;
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+ if (ret == 0xfeff) {
+ endian = 1; // Big endian
+ } else if (ret == 0xfffe) {  // Byte-swapped BOM.
+ endian = 0; // Little endian
+ } else {
+ return State::INVALID;  // Input does not start with a BOM: rejected.
+ }
+
+ // To allow offset to advance and to return, we need to
+ // read at least one more character completely.
+ ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;  // endian_ and in_offset are untouched, so detection reruns on the next call.
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+
+ endian_ = endian;
+ in_offset = tmp;
+ out[out_offset++] = ret;  // NOTE(review): unconditional write; appears to assume out_offset < out_size on entry -- confirm with callers.
+ if (out_offset == out_size)
+ return State::GOOD;
+ }
+ // Steady state: one loop per byte order; both report partial progress as GOOD.
+ if (endian_ == 1) {
+ do {
+ uint32_t ret = utf::read16be(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ } else {
+ do {
+ uint32_t ret = utf::read16le(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ }
+ return State::GOOD;
+ }
+
+ private:
+ int8_t endian_{-1};  // -1: not detected yet, 1: big endian, 0: little endian.
+};
+
+class Utf32Decoder : public Decoder {  // UTF-32 decoder that requires a BOM and takes its byte order from it.
+ public:
+ Utf32Decoder() = default;
+
+ State decode(std::string_view in, std::size_t& in_offset,  // Decodes from in[in_offset...] into out[out_offset...], advancing both offsets.
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;  // Lets later returns tell whether this call wrote anything.
+ if (endian_ == -1) UNLIKELY {  // Byte order not determined yet.
+ std::size_t tmp = in_offset;  // Scratch offset; in_offset is committed only on success.
+ uint32_t ret = utf::read32be(in, tmp);  // Try the BOM as big endian first.
+ int8_t endian;
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {  // Not a valid big-endian character: retry the same bytes as little endian.
+ tmp = in_offset;
+ ret = utf::read32le(in, tmp);
+ if (ret == 0xfeff) {
+ endian = 0; // Little endian
+ } else {
+ return State::INVALID;  // Any other result, including the utf:: sentinels, is rejected.
+ }
+ } else if (ret == 0xfeff) {
+ endian = 1; // Big endian
+ } else {
+ return State::INVALID;  // Valid big-endian character that is not a BOM: rejected.
+ }
+
+ // To allow offset to advance and to return, we need to
+ // read the next character completely.
+ ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;  // endian_ and in_offset are untouched, so detection reruns on the next call.
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+
+ endian_ = endian;
+ in_offset = tmp;
+ out[out_offset++] = ret;  // NOTE(review): unconditional write; appears to assume out_offset < out_size on entry -- confirm with callers.
+ if (out_offset == out_size)
+ return State::GOOD;
+ }
+ // Steady state: one loop per byte order; both report partial progress as GOOD.
+ if (endian_ == 1) {
+ do {
+ uint32_t ret = utf::read32be(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ } else {
+ do {
+ uint32_t ret = utf::read32le(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ }
+ return State::GOOD;
+ }
+
+ private:
+ int8_t endian_{-1};  // -1: not detected yet, 1: big endian, 0: little endian.
+};
+
+// Decoder for US-ASCII: every input byte below 0x80 becomes one output
+// character; a byte with the high bit set is invalid.
+class AsciiDecoder : public Decoder {
+ public:
+  AsciiDecoder() = default;
+
+  // Copies bytes from in[in_offset...] to out[out_offset...], advancing both
+  // offsets. Returns GOOD as soon as at least one character was written in
+  // this call; otherwise NEED_MORE at end of input or INVALID on a byte
+  // with the high bit set.
+  State decode(std::string_view in, std::size_t& in_offset,
+               uint32_t* out, std::size_t out_size,
+               std::size_t& out_offset) override {
+    std::size_t const first_out = out_offset;
+    for (;;) {
+      if (in_offset == in.size()) {
+        if (out_offset > first_out)
+          break;
+        return State::NEED_MORE;
+      }
+      char const byte = in[in_offset];
+      if (byte & 0x80) {
+        if (out_offset > first_out)
+          break;
+        return State::INVALID;
+      }
+      out[out_offset] = byte;
+      ++out_offset;
+      ++in_offset;
+      if (!(out_offset < out_size))
+        break;
+    }
+    return State::GOOD;
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Decoder> create_utf8_decoder() {  // Returns a new Utf8Decoder (UTF-8, leading BOM optional).
+ return std::make_unique<Utf8Decoder>();
+}
+
+std::unique_ptr<Decoder> create_utf16be_decoder() {  // Returns a new Utf16BeDecoder (big-endian UTF-16, leading BOM optional).
+ return std::make_unique<Utf16BeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf16le_decoder() {  // Returns a new Utf16LeDecoder (little-endian UTF-16, leading BOM optional).
+ return std::make_unique<Utf16LeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf32be_decoder() {  // Returns a new Utf32BeDecoder (big-endian UTF-32, leading BOM optional).
+ return std::make_unique<Utf32BeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf32le_decoder() {  // Returns a new Utf32LeDecoder (little-endian UTF-32, leading BOM optional).
+ return std::make_unique<Utf32LeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf16_decoder() {  // Returns a new Utf16Decoder (UTF-16, byte order taken from a mandatory BOM).
+ return std::make_unique<Utf16Decoder>();
+}
+
+std::unique_ptr<Decoder> create_utf32_decoder() {  // Returns a new Utf32Decoder (UTF-32, byte order taken from a mandatory BOM).
+ return std::make_unique<Utf32Decoder>();
+}
+
+std::unique_ptr<Decoder> create_ascii_decoder() {  // Returns a new AsciiDecoder (7-bit US-ASCII only).
+ return std::make_unique<AsciiDecoder>();
+}
+
+} // namespace sax
+} // namespace modxml
+
diff --git a/sax/src/decoder.hh b/sax/src/decoder.hh
new file mode 100644
index 0000000..bd2a99a
--- /dev/null
+++ b/sax/src/decoder.hh
@@ -0,0 +1,33 @@
+#ifndef DECODER_HH
+#define DECODER_HH
+
+#include "macros.hh"
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+// UTF-8; a leading BOM (U+FEFF) is optional and is not passed on as output.
+std::unique_ptr<Decoder> HIDDEN create_utf8_decoder();
+// UTF-16 with a mandatory BOM, which selects big or little endian.
+std::unique_ptr<Decoder> HIDDEN create_utf16_decoder();
+// UTF-16BE; a leading BOM is optional and is not passed on as output.
+std::unique_ptr<Decoder> HIDDEN create_utf16be_decoder();
+// UTF-16LE; a leading BOM is optional and is not passed on as output.
+std::unique_ptr<Decoder> HIDDEN create_utf16le_decoder();
+// UTF-32 with a mandatory BOM, which selects big or little endian.
+std::unique_ptr<Decoder> HIDDEN create_utf32_decoder();
+// UTF-32BE; a leading BOM is optional and is not passed on as output.
+std::unique_ptr<Decoder> HIDDEN create_utf32be_decoder();
+// UTF-32LE; a leading BOM is optional and is not passed on as output.
+std::unique_ptr<Decoder> HIDDEN create_utf32le_decoder();
+// US-ASCII; bytes with the high bit set are rejected as invalid.
+std::unique_ptr<Decoder> HIDDEN create_ascii_decoder();
+
+} // namespace sax
+} // namespace modxml
+
+#endif // DECODER_HH
diff --git a/sax/src/processor.hh b/sax/src/processor.hh
new file mode 100644
index 0000000..4a2de29
--- /dev/null
+++ b/sax/src/processor.hh
@@ -0,0 +1,27 @@
+#ifndef PROCESSOR_HH
+#define PROCESSOR_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <optional>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class DecoderFactory;
+class Delegate;
+class Processor;
+
+std::unique_ptr<Processor> HIDDEN create_processor(  // Builds a Processor from explicit settings; unset optionals fall back to the defaults in sax_processor.cc.
+ std::shared_ptr<Delegate> delegate,
+ std::shared_ptr<DecoderFactory> decoder_factory,  // May be null; consulted only for encodings without a built-in decoder.
+ std::optional<std::string> force_encoding,  // When set, a decoder for this encoding is picked up front.
+ std::optional<std::size_t> default_buffer_size,  // Values below 128 are raised to 128; unset means 8192.
+ std::optional<std::size_t> max_buffer_size);  // Unset means 10 MiB; a set value is used without validation.
+
+} // namespace sax
+} // namespace modxml
+
+#endif // PROCESSOR_HH
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
new file mode 100644
index 0000000..ea9f753
--- /dev/null
+++ b/sax/src/sax_processor.cc
@@ -0,0 +1,145 @@
+#include "sax_processor.hh"
+
+#include "sax_decoder.hh"
+#include "processor.hh"
+#include "utils.hh"
+
+#include <algorithm>
+#include <optional>
+#include <utility>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// 2.2 Characters
+// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+
+// True when c matches production [2] Char, given that the decoder already
+// produced a valid Unicode scalar value (no surrogate or range check here).
+inline bool valid_char(uint32_t c) {
+  switch (c) {
+    case 0x9:
+    case 0xa:
+    case 0xd:
+      // The only control characters XML allows.
+      return true;
+    default:
+      break;
+  }
+  bool const in_bmp = 0x20 <= c && c <= 0xfffd;
+  bool const beyond_bmp = c >= 0x10000;
+  return in_bmp || beyond_bmp;
+}
+
+// 2.3 Common Syntactic Constructs
+// [3] S ::= (#x20 | #x9 | #xD | #xA)+
+
+// True when c is XML white space (production [3] S). Only meaningful for
+// characters that already passed valid_char(): among those, everything up
+// to and including U+0020 is #x9, #xA, #xD or #x20.
+inline bool is_ws(uint32_t c) {
+  constexpr uint32_t kSpace = 0x20;
+  return !(c > kSpace);
+}
+
+// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+
+// True when c matches production [4] NameStartChar.
+//
+// The checks walk the code space in ascending order; each `return false`
+// covers one gap between two allowed ranges of the production.
+inline bool is_namestartchar(uint32_t c) {
+  if (c < 0x41 /* A */)
+    return c == 0x3a /* : */;
+  if (c <= 0x5a /* Z */)
+    return true;
+  if (c < 0x61 /* a */)
+    return c == 0x5f /* _ */;
+  if (c <= 0x7a /* z */)
+    return true;
+  if (c < 0xc0)
+    return false;
+  // [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
+  if (c < 0x300)
+    return c != 0xd7 && c != 0xf7;
+  // [#x300-#x36F] (combining marks) are NameChar only, and #x37E sits
+  // between [#x370-#x37D] and [#x37F-#x1FFF].
+  if (c < 0x370 || c == 0x37e)
+    return false;
+  if (c > 0x1fff && c < 0x200c)
+    return false;
+  if (c > 0x200d && c < 0x2070)
+    return false;
+  if (c > 0x218f && c < 0x2c00)
+    return false;
+  if (c > 0x2fef && c < 0x3001)
+    return false;
+  // Gap between [#x3001-#xD7FF] and [#xF900-#xFDCF]; this also rejects the
+  // surrogate block should one ever get this far.
+  if (c > 0xd7ff && c < 0xf900)
+    return false;
+  if (c > 0xfdcf && c < 0xfdf0)
+    return false;
+  if (c > 0xfffd && c < 0x10000)
+    return false;
+  // The last range is [#x10000-#xEFFFF]; planes 15 and 16 are excluded.
+  return c <= 0xeffff;
+}
+
+// True when c matches production [4a] NameChar: any NameStartChar plus
+// "-", ".", the digits, #xB7, [#x0300-#x036F] and [#x203F-#x2040].
+inline bool is_namechar(uint32_t c) {
+  if (is_namestartchar(c))
+    return true;
+  if (c == 0x2d /* - */ || c == 0x2e /* . */)
+    return true;
+  if (0x30 /* 0 */ <= c && c <= 0x39 /* 9 */)
+    return true;
+  if (c == 0xb7)
+    return true;
+  if (0x300 <= c && c <= 0x36f)
+    return true;
+  return c == 0x203f || c == 0x2040;
+}
+
+/* [5] Name ::= NameStartChar (NameChar)*
+[6] Names ::= Name (#x20 Name)*
+[7] Nmtoken ::= (NameChar)+
+[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+*/
+
+class ProcessorImpl : public Processor {  // Processor implementation; in the code shown here it only stores its configuration.
+ public:
+ ProcessorImpl(std::shared_ptr<Delegate> delegate,  // Takes every argument by value and moves or copies it into the matching member.
+ std::shared_ptr<DecoderFactory> decoder_factory,
+ std::unique_ptr<Decoder> decoder,  // May be null: create_processor() only picks a decoder when an encoding is forced.
+ std::size_t default_buffer_size,
+ std::size_t max_buffer_size)
+ : delegate_(std::move(delegate)),
+ decoder_factory_(std::move(decoder_factory)),
+ decoder_(std::move(decoder)),
+ default_buffer_size_(default_buffer_size),
+ max_buffer_size_(max_buffer_size) {}
+
+ private:
+ std::shared_ptr<Delegate> delegate_;
+ std::shared_ptr<DecoderFactory> decoder_factory_;  // Null when created through create().
+ std::unique_ptr<Decoder> decoder_;  // Null unless an encoding was forced and a decoder was found for it.
+ std::size_t default_buffer_size_;
+ std::size_t max_buffer_size_;
+};
+
+} // namespace
+
+// Creates a ProcessorImpl from the given settings.
+//
+// A decoder is picked up front only when force_encoding is set; it stays
+// null otherwise (and also when no decoder matches the forced encoding).
+// The default buffer size is 8192 unless overridden, and an override is
+// never allowed below 128. The max buffer size is 10 MiB unless overridden.
+std::unique_ptr<Processor> create_processor(
+    std::shared_ptr<Delegate> delegate,
+    std::shared_ptr<DecoderFactory> decoder_factory,
+    std::optional<std::string> force_encoding,
+    std::optional<std::size_t> opt_default_buffer_size,
+    std::optional<std::size_t> opt_max_buffer_size) {
+  constexpr std::size_t kMinDefaultBufferSize = 128;
+  constexpr std::size_t kDefaultBufferSize = 8192;
+  // This value is documented in public headers. Do NOT change.
+  constexpr std::size_t kMaxBufferSize = 10 * 1024 * 1024;
+
+  std::unique_ptr<Decoder> decoder;
+  if (force_encoding) {
+    decoder = pick_decoder_for_encoding(*force_encoding,
+                                        decoder_factory.get());
+  }
+
+  std::size_t const default_buffer_size =
+      opt_default_buffer_size
+          ? std::max(kMinDefaultBufferSize, *opt_default_buffer_size)
+          : kDefaultBufferSize;
+
+  // A user supplied maximum is deliberately not validated. Too small and
+  // MAX_MEMORY_EXCEEDED will be thrown; too large and we get OUT_OF_MEMORY
+  // or a crash, depending on platform.
+  std::size_t const max_buffer_size =
+      opt_max_buffer_size.value_or(kMaxBufferSize);
+
+  return std::make_unique<ProcessorImpl>(std::move(delegate),
+                                         std::move(decoder_factory),
+                                         std::move(decoder),
+                                         default_buffer_size,
+                                         max_buffer_size);
+}
+
+std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) {  // Creates a processor with all defaults: no decoder factory, no forced encoding, default buffer sizes.
+ return create_processor(std::move(delegate), nullptr,
+ std::nullopt, std::nullopt, std::nullopt);
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/sax_processor_builder.cc b/sax/src/sax_processor_builder.cc
new file mode 100644
index 0000000..8817099
--- /dev/null
+++ b/sax/src/sax_processor_builder.cc
@@ -0,0 +1,62 @@
+#include "sax_processor_builder.hh"
+
+#include "processor.hh"
+#include "sax_processor.hh"
+
+#include <optional>
+#include <utility>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+class ProcessorBuilderImpl : public ProcessorBuilder {  // Collects optional settings and hands them to create_processor() in build().
+ public:
+ ProcessorBuilder* force_encoding(std::string const& str) override {  // Stores the encoding to force; returns this so calls can be chained.
+ force_encoding_ = str;
+ return this;
+ }
+
+ ProcessorBuilder* custom_decoder_factory(  // Stores the factory handed to create_processor(); returns this.
+ std::shared_ptr<DecoderFactory> custom_decoder_factory) override {
+ decoder_factory_ = std::move(custom_decoder_factory);
+ return this;
+ }
+
+ ProcessorBuilder* set_default_buffer_size(std::size_t size) override {  // Stored as given; create_processor() raises values below 128 to 128.
+ default_buffer_size_ = size;
+ return this;
+ }
+
+ ProcessorBuilder* set_max_buffer_size(std::size_t size) override {  // Stored as given; create_processor() does not validate it.
+ max_buffer_size_ = size;
+ return this;
+ }
+
+ std::unique_ptr<Processor> build(  // Const: the stored settings are copied, so the builder can be reused.
+ std::shared_ptr<Delegate> delegate) const override {
+ return create_processor(std::move(delegate),
+ decoder_factory_,
+ force_encoding_,
+ default_buffer_size_,
+ max_buffer_size_);
+ }
+
+ ProcessorBuilderImpl() = default;
+
+ private:
+ std::shared_ptr<DecoderFactory> decoder_factory_;  // Null until custom_decoder_factory() is called.
+ std::optional<std::string> force_encoding_;  // Empty until force_encoding() is called.
+ std::optional<std::size_t> default_buffer_size_;  // Empty means create_processor() uses its default.
+ std::optional<std::size_t> max_buffer_size_;  // Empty means create_processor() uses its default.
+};
+
+} // namespace
+
+std::unique_ptr<ProcessorBuilder> ProcessorBuilder::create() {  // Returns a new ProcessorBuilderImpl with no settings applied.
+ return std::make_unique<ProcessorBuilderImpl>();
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/utils.cc b/sax/src/utils.cc
new file mode 100644
index 0000000..f0366d5
--- /dev/null
+++ b/sax/src/utils.cc
@@ -0,0 +1,70 @@
+#include "utils.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Normalises an encoding label for comparison: ASCII letters are lowered,
+// digits are kept, '.', '_' and '-' all become '-', and every other
+// character is dropped.
+std::string cleanup_encoding(std::string const& str) {
+  std::string cleaned;
+  cleaned.reserve(str.size());
+  for (char const ch : str) {
+    switch (ch) {
+      case '.':
+      case '_':
+      case '-':
+        cleaned.push_back('-');
+        break;
+      default:
+        if ('A' <= ch && ch <= 'Z') {
+          // Setting bit 0x20 maps an ASCII upper case letter to lower case.
+          cleaned.push_back(static_cast<char>(ch | 0x20));
+        } else if (('a' <= ch && ch <= 'z') || ('0' <= ch && ch <= '9')) {
+          cleaned.push_back(ch);
+        }
+        break;
+    }
+  }
+  return cleaned;
+}
+
+} // namespace
+
+// Names inspired by:
+// https://www.iana.org/assignments/character-sets/character-sets.xhtml
+// Returns a decoder for the named encoding. Built-in decoders are matched
+// against the cleaned-up name first; anything else is passed, with its
+// original spelling, to factory when one is given. Returns null when
+// nothing matches and there is no factory.
+std::unique_ptr<Decoder> pick_decoder_for_encoding(
+    std::string const& encoding, DecoderFactory* factory) {
+  auto const name = cleanup_encoding(encoding);
+  // True when the cleaned-up name equals any of the given aliases.
+  auto const is_one_of = [&name](auto... aliases) {
+    return ((name == aliases) || ...);
+  };
+
+  if (is_one_of("utf-8", "utf8"))
+    return create_utf8_decoder();
+  if (is_one_of("utf-16", "utf16"))
+    return create_utf16_decoder();
+  if (is_one_of("utf-16be", "utf16be"))
+    return create_utf16be_decoder();
+  if (is_one_of("utf-16le", "utf16le"))
+    return create_utf16le_decoder();
+  if (is_one_of("utf-32", "utf32"))
+    return create_utf32_decoder();
+  if (is_one_of("utf-32be", "utf32be"))
+    return create_utf32be_decoder();
+  if (is_one_of("utf-32le", "utf32le"))
+    return create_utf32le_decoder();
+  if (is_one_of("ascii", "us-ascii", "usascii", "iso-ir-6",
+                "ansi-x3-4-1968", "ansi-x3-4-1986", "iso-646-irv1991",
+                "iso646-us", "us", "ibm367", "cp367"))
+    return create_ascii_decoder();
+
+  if (factory == nullptr)
+    return nullptr;
+  return factory->create(encoding);
+}
+
+} // namespace sax
+
+} // namespace modxml
diff --git a/sax/src/utils.hh b/sax/src/utils.hh
new file mode 100644
index 0000000..206d003
--- /dev/null
+++ b/sax/src/utils.hh
@@ -0,0 +1,22 @@
+#ifndef UTILS_HH
+#define UTILS_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+class DecoderFactory;
+
+std::unique_ptr<Decoder> HIDDEN pick_decoder_for_encoding(  // Returns a decoder for the named encoding, or null if none is available.
+ std::string const& encoding,  // Encoding label; matched against the built-in names after normalisation in utils.cc.
+ DecoderFactory* factory);  // May be null; asked only when no built-in decoder matches.
+
+} // namespace sax
+} // namespace modxml
+
+#endif // UTILS_HH