summaryrefslogtreecommitdiff
path: root/sax
diff options
context:
space:
mode:
Diffstat (limited to 'sax')
-rw-r--r--sax/inc/sax_decoder.hh57
-rw-r--r--sax/inc/sax_decoder_factory.hh35
-rw-r--r--sax/inc/sax_delegate.hh22
-rw-r--r--sax/inc/sax_error.hh36
-rw-r--r--sax/inc/sax_processor.hh37
-rw-r--r--sax/inc/sax_processor_builder.hh82
-rw-r--r--sax/meson.build22
-rw-r--r--sax/src/decoder.cc321
-rw-r--r--sax/src/decoder.hh33
-rw-r--r--sax/src/processor.hh27
-rw-r--r--sax/src/sax_processor.cc145
-rw-r--r--sax/src/sax_processor_builder.cc62
-rw-r--r--sax/src/utils.cc70
-rw-r--r--sax/src/utils.hh22
14 files changed, 971 insertions, 0 deletions
diff --git a/sax/inc/sax_decoder.hh b/sax/inc/sax_decoder.hh
new file mode 100644
index 0000000..40a56c9
--- /dev/null
+++ b/sax/inc/sax_decoder.hh
@@ -0,0 +1,57 @@
+#ifndef SAX_DECODER_HH
+#define SAX_DECODER_HH
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace modxml {
+namespace sax {
+
+/**
+ * Decoder returned by DecoderFactory. Used by Processor to turn bytes into
+ * Unicode characters.
+ */
+class Decoder {
+ public:
+ virtual ~Decoder() = default;
+
+ /** Result of a decode() call. */
+ enum class State {
+ // at least one code point was decoded and written
+ GOOD = 0,
+ // too little data was given to advance
+ NEED_MORE,
+ // invalid data was given to advance
+ INVALID,
+ };
+
+ /**
+ * Decode as many code points as possible from in (starting at in_offset) and
+ * write them to out (starting at out_offset) as UTF-8.
+ * All written code points must be valid per Unicode, so inside the
+ * range U+0 to U+10FFFF and not a surrogate (U+D800-U+DFFF).
+ * No partial output, only write to out if the whole UTF-8 sequence is
+ * going to fit.
+ * There are always at least 4 bytes available (out.size() - out_offset) when
+ * called.
+ * Advance in_offset for data consumed.
+ * Advance out_offset for code points written. Do NOT write past out.size().
+ * Do NOT resize out.
+ * If at least one code point is decoded and written to out, return GOOD.
+ * If it is not possible to decode a single code point, in_offset and
+ * out_offset should not be advanced and something other than GOOD returned.
+ * Do not keep any references to any of the parameters after returning. The
+ * next decode() call will point to the following bytes, but all parameters
+ * may have changed as they are subject to the buffer implementations of the
+ * Processor.
+ */
+ virtual State decode(std::string_view in, std::size_t& in_offset,
+ std::string& out, std::size_t& out_offset) = 0;
+
+ protected:
+ Decoder() = default;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // SAX_DECODER_HH
diff --git a/sax/inc/sax_decoder_factory.hh b/sax/inc/sax_decoder_factory.hh
new file mode 100644
index 0000000..80f1af3
--- /dev/null
+++ b/sax/inc/sax_decoder_factory.hh
@@ -0,0 +1,35 @@
+#ifndef SAX_DECODER_FACTORY_HH
+#define SAX_DECODER_FACTORY_HH
+
+#include <memory>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+/**
+ * Factory for decoders. You can give one to ProcessorBuilder via
+ * custom_decoder_factory().
+ */
+class DecoderFactory {
+ public:
+ virtual ~DecoderFactory() = default;
+
+ /**
+ * If encoding is supported, return a decoder for that encoding.
+ * Return nullptr if it is not supported; the Processor will then report an
+ * UNKNOWN_ENCODING error.
+ * Note that the encoding value isn't cleaned up or validated in any way, it
+ * is reported EXACTLY as found (even if not valid per the XML spec).
+ */
+ virtual std::unique_ptr<Decoder> create(std::string const& encoding) = 0;
+
+ protected:
+ DecoderFactory() = default;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // SAX_DECODER_FACTORY_HH
diff --git a/sax/inc/sax_delegate.hh b/sax/inc/sax_delegate.hh
new file mode 100644
index 0000000..ba63e72
--- /dev/null
+++ b/sax/inc/sax_delegate.hh
@@ -0,0 +1,22 @@
+#ifndef MODXML_SAX_DELEGATE_HH
+#define MODXML_SAX_DELEGATE_HH
+
+namespace modxml {
+namespace sax {
+
+/**
+ * Delegate for processor.
+ * Implement to handle events.
+ * Note: no event callbacks are declared yet; the class currently only
+ * provides a virtual destructor so implementations can be owned through it.
+ */
+class Delegate {
+ public:
+ virtual ~Delegate() = default;
+
+ protected:
+ Delegate() = default;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // MODXML_SAX_DELEGATE_HH
diff --git a/sax/inc/sax_error.hh b/sax/inc/sax_error.hh
new file mode 100644
index 0000000..748f995
--- /dev/null
+++ b/sax/inc/sax_error.hh
@@ -0,0 +1,36 @@
+#ifndef MODXML_SAX_ERROR_HH
+#define MODXML_SAX_ERROR_HH
+
+namespace modxml {
+namespace sax {
+
+enum class Error {
+ /**
+ * The XML spec has a list of characters that are never allowed in a document.
+ */
+ INVALID_CHAR,
+ /**
+ * If the document encoding is unsupported or unknown.
+ */
+ UNKNOWN_ENCODING,
+ /**
+ * If the document is incomplete. This is one of the few recoverable errors,
+ * if you call the processor with more data it will continue.
+ */
+ INCOMPLETE,
+ /**
+ * An entity in the document exceeded the max buffer size (either set by
+ * ProcessorBuilder or the default 10 MiB).
+ */
+ MAX_MEMORY_EXCEEDED,
+ /**
+ * A memory allocation failed. Note that this doesn't protect against
+ * usage of overallocated memory.
+ */
+ OUT_OF_MEMORY,
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // MODXML_SAX_ERROR_HH
diff --git a/sax/inc/sax_processor.hh b/sax/inc/sax_processor.hh
new file mode 100644
index 0000000..7ca32f7
--- /dev/null
+++ b/sax/inc/sax_processor.hh
@@ -0,0 +1,37 @@
+#ifndef MODXML_SAX_PROCESSOR_HH
+#define MODXML_SAX_PROCESSOR_HH
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Delegate;
+
+/**
+ * The XML processor, or parser if you like that term better.
+ * Feed it data and the processor will give the delegate calls with events or
+ * possibly errors.
+ * Processors are not copyable.
+ */
+class Processor {
+ public:
+ virtual ~Processor() = default;
+
+ /**
+ * Construct a Processor. Same as creating a ProcessorBuilder,
+ * not changing any options and just calling build.
+ */
+ static std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate);
+
+ protected:
+ Processor() = default;
+
+ private:
+ // Copying is disabled.
+ Processor(Processor const&) = delete;
+ Processor& operator=(Processor const&) = delete;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // MODXML_SAX_PROCESSOR_HH
diff --git a/sax/inc/sax_processor_builder.hh b/sax/inc/sax_processor_builder.hh
new file mode 100644
index 0000000..070fbbf
--- /dev/null
+++ b/sax/inc/sax_processor_builder.hh
@@ -0,0 +1,82 @@
+#ifndef MODXML_SAX_PROCESSOR_BUILDER_HH
+#define MODXML_SAX_PROCESSOR_BUILDER_HH
+
+#include <memory>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class DecoderFactory;
+class Delegate;
+class Processor;
+
+/**
+ * Used to construct Processors with options set if needed.
+ * Every setter returns the builder itself so calls can be chained.
+ */
+class ProcessorBuilder {
+ public:
+ virtual ~ProcessorBuilder() = default;
+
+ /**
+ * Construct a ProcessorBuilder. All options are set to default.
+ */
+ static std::unique_ptr<ProcessorBuilder> create();
+
+ /**
+ * If you know the encoding of the data sent to the processor set it here,
+ * this will stop the processor from trying to autodetect and will ignore
+ * the encoding in any XML declaration if found.
+ * If the encoding is unsupported/unknown the processor will fail with
+ * an error indicating this, same as if it read an XML declaration with
+ * an unsupported or unknown encoding.
+ */
+ virtual ProcessorBuilder* force_encoding(std::string const& str) = 0;
+
+ /**
+ * Set a decoder factory for encodings not supported by the library.
+ * The library only calls this for encodings it doesn't support itself.
+ * The library supports UTF-8, UTF-16, UTF-32 and US-ASCII.
+ * If you want to force the decoder factory to be used, force a custom
+ * encoding with force_encoding above.
+ */
+ virtual ProcessorBuilder* custom_decoder_factory(
+ std::shared_ptr<DecoderFactory> custom_decoder_factory) = 0;
+
+ /**
+ * Set the default buffer size the processor should use.
+ * If you give a too small buffer size (such as zero) it will be ignored
+ * and an implementation specific minimum will be used instead.
+ * This is meant as a possible optimization and can be completely ignored.
+ * Note that the processor will allocate more data if needed.
+ */
+ virtual ProcessorBuilder* set_default_buffer_size(std::size_t size) = 0;
+
+ /**
+ * Set the max buffer size the processor should use.
+ * If you have memory constraints this will block the processing of CDATA,
+ * or other entities, from allocating more than the given size.
+ * Default is 10MiB.
+ */
+ virtual ProcessorBuilder* set_max_buffer_size(std::size_t size) = 0;
+
+ /**
+ * Call to construct a Processor with the options set up in this builder,
+ * using the delegate given as parameter.
+ * May be called multiple times, will create a unique Processor each time.
+ */
+ virtual std::unique_ptr<Processor> build(
+ std::shared_ptr<Delegate> delegate) const = 0;
+
+ protected:
+ ProcessorBuilder() = default;
+
+ private:
+ // Copying is disabled.
+ ProcessorBuilder(ProcessorBuilder const&) = delete;
+ ProcessorBuilder& operator=(ProcessorBuilder const&) = delete;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // MODXML_SAX_PROCESSOR_BUILDER_HH
diff --git a/sax/meson.build b/sax/meson.build
new file mode 100644
index 0000000..ccbdef4
--- /dev/null
+++ b/sax/meson.build
@@ -0,0 +1,22 @@
+# Dependencies needed both to build the library and by anything using sax_dep.
+deps = [
+ base_dep,
+ utf_dep,
+]
+
+# Public headers live in inc/.
+inc = include_directories('inc')
+lib = shared_library(
+ 'modxmlsax',
+ 'src/decoder.cc',
+ 'src/sax_processor.cc',
+ 'src/sax_processor_builder.cc',
+ 'src/utils.cc',
+ dependencies: deps,
+ include_directories: inc,
+ install: true,
+)
+
+# Dependency object for other targets in the project that link against the
+# SAX library.
+sax_dep = declare_dependency(
+ dependencies: deps,
+ include_directories: inc,
+ link_with: lib,
+)
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc
new file mode 100644
index 0000000..30b1735
--- /dev/null
+++ b/sax/src/decoder.cc
@@ -0,0 +1,321 @@
+#include "decoder.hh"
+
+#include "macros.hh"
+#include "sax_decoder.hh"
+#include "utf16.hh"
+#include "utf32.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Shared implementation for decoders with a fixed encoding; the subclass
+// supplies read() to extract one code point. A leading U+FEFF (byte order
+// mark), if present, is consumed and not written to out.
+// NOTE(review): decode() here takes (uint32_t* out, std::size_t out_size)
+// while Decoder::decode() in sax_decoder.hh takes (std::string& out). The two
+// signatures need to be reconciled for `override` to be valid.
+class UtfDecoder : public Decoder {
+ public:
+ State decode(std::string_view in, std::size_t& in_offset,
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;
+ // First call only: check for an optional BOM.
+ if (bom_ == -1) UNLIKELY {
+ // Work on a copy so in_offset is untouched if we return early.
+ std::size_t tmp = in_offset;
+ uint32_t ret = read(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+ if (ret == 0xfeff) {
+ // To allow offset to advance and to return, we need to
+ // read at least one more character completely.
+ ret = read(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+ bom_ = 1;
+ } else {
+ bom_ = 0;
+ }
+ // Commit the consumed input and emit the first real code point.
+ in_offset = tmp;
+ out[out_offset++] = ret;
+ if (out_offset == out_size)
+ return State::GOOD;
+ }
+
+ // A failed read only becomes NEED_MORE/INVALID when nothing was written
+ // during this call; otherwise GOOD is returned for what was written.
+ do {
+ uint32_t ret = read(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ return State::GOOD;
+ }
+
+ protected:
+ UtfDecoder() = default;
+
+ // Read one code point from data at offset. Results are compared against
+ // utf::NEED_MORE and utf::INVALID by decode().
+ virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;
+
+ private:
+ // -1: BOM not checked yet, 0: no BOM found, 1: BOM found and skipped.
+ int8_t bom_{-1};
+};
+
+// UtfDecoder that reads code points with utf::read8().
+class Utf8Decoder : public UtfDecoder {
+ public:
+ Utf8Decoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read8(data, offset);
+ }
+};
+
+// UtfDecoder that reads code points with utf::read16be().
+class Utf16BeDecoder : public UtfDecoder {
+ public:
+ Utf16BeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read16be(data, offset);
+ }
+};
+
+// UtfDecoder that reads code points with utf::read16le().
+class Utf16LeDecoder : public UtfDecoder {
+ public:
+ Utf16LeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read16le(data, offset);
+ }
+};
+
+// UtfDecoder that reads code points with utf::read32be().
+class Utf32BeDecoder : public UtfDecoder {
+ public:
+ Utf32BeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read32be(data, offset);
+ }
+};
+
+// UtfDecoder that reads code points with utf::read32le().
+class Utf32LeDecoder : public UtfDecoder {
+ public:
+ Utf32LeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read32le(data, offset);
+ }
+};
+
+// UTF-16 decoder that picks its byte order from a mandatory BOM: the first
+// unit read as big endian must be 0xFEFF (big endian) or 0xFFFE (little
+// endian), anything else is INVALID. The BOM itself is not written to out.
+// NOTE(review): decode() here takes (uint32_t* out, std::size_t out_size)
+// while Decoder::decode() in sax_decoder.hh takes (std::string& out). The two
+// signatures need to be reconciled for `override` to be valid.
+class Utf16Decoder : public Decoder {
+ public:
+ Utf16Decoder() = default;
+
+ State decode(std::string_view in, std::size_t& in_offset,
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;
+ // First call only: detect byte order from the BOM.
+ if (endian_ == -1) UNLIKELY {
+ // Work on a copy so in_offset is untouched if we return early.
+ std::size_t tmp = in_offset;
+ uint32_t ret = utf::read16be(in, tmp);
+ int8_t endian;
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+ if (ret == 0xfeff) {
+ endian = 1; // Big endian
+ } else if (ret == 0xfffe) {
+ endian = 0; // Little endian
+ } else {
+ return State::INVALID;
+ }
+
+ // To allow offset to advance and to return, we need to
+ // read at least one more character completely.
+ ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+
+ // Only remember the byte order once a character was decoded with it.
+ endian_ = endian;
+ in_offset = tmp;
+ out[out_offset++] = ret;
+ if (out_offset == out_size)
+ return State::GOOD;
+ }
+
+ // A failed read only becomes NEED_MORE/INVALID when nothing was written
+ // during this call; otherwise GOOD is returned for what was written.
+ if (endian_ == 1) {
+ do {
+ uint32_t ret = utf::read16be(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ } else {
+ do {
+ uint32_t ret = utf::read16le(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ }
+ return State::GOOD;
+ }
+
+ private:
+ // -1: byte order not detected yet, 0: little endian, 1: big endian.
+ int8_t endian_{-1};
+};
+
+// UTF-32 decoder that picks its byte order from a mandatory BOM. The first
+// unit is read as big endian; if that read is INVALID the same bytes are
+// retried as little endian. Anything but U+FEFF is INVALID. The BOM itself is
+// not written to out.
+// NOTE(review): decode() here takes (uint32_t* out, std::size_t out_size)
+// while Decoder::decode() in sax_decoder.hh takes (std::string& out). The two
+// signatures need to be reconciled for `override` to be valid.
+class Utf32Decoder : public Decoder {
+ public:
+ Utf32Decoder() = default;
+
+ State decode(std::string_view in, std::size_t& in_offset,
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;
+ // First call only: detect byte order from the BOM.
+ if (endian_ == -1) UNLIKELY {
+ // Work on a copy so in_offset is untouched if we return early.
+ std::size_t tmp = in_offset;
+ uint32_t ret = utf::read32be(in, tmp);
+ int8_t endian;
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ // Not a valid big endian code point, retry as little endian.
+ // NOTE(review): presumably a byte-swapped BOM is rejected by
+ // utf::read32be() as out of range - confirm in utf32.hh.
+ tmp = in_offset;
+ ret = utf::read32le(in, tmp);
+ if (ret == 0xfeff) {
+ endian = 0; // Little endian
+ } else {
+ return State::INVALID;
+ }
+ } else if (ret == 0xfeff) {
+ endian = 1; // Big endian
+ } else {
+ return State::INVALID;
+ }
+
+ // To allow offset to advance and to return, we need to
+ // read the next character completely.
+ ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+
+ // Only remember the byte order once a character was decoded with it.
+ endian_ = endian;
+ in_offset = tmp;
+ out[out_offset++] = ret;
+ if (out_offset == out_size)
+ return State::GOOD;
+ }
+
+ // A failed read only becomes NEED_MORE/INVALID when nothing was written
+ // during this call; otherwise GOOD is returned for what was written.
+ if (endian_ == 1) {
+ do {
+ uint32_t ret = utf::read32be(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ } else {
+ do {
+ uint32_t ret = utf::read32le(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ }
+ return State::GOOD;
+ }
+
+ private:
+ // -1: byte order not detected yet, 0: little endian, 1: big endian.
+ int8_t endian_{-1};
+};
+
+// US-ASCII decoder: every input byte becomes one code point, bytes with the
+// high bit set are INVALID.
+// NOTE(review): decode() here takes (uint32_t* out, std::size_t out_size)
+// while Decoder::decode() in sax_decoder.hh takes (std::string& out). The two
+// signatures need to be reconciled for `override` to be valid.
+class AsciiDecoder : public Decoder {
+ public:
+ AsciiDecoder() = default;
+
+ State decode(std::string_view in, std::size_t& in_offset,
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;
+ // Running out of input or hitting a non-ASCII byte only becomes
+ // NEED_MORE/INVALID when nothing was written during this call.
+ do {
+ if (in_offset == in.size())
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ if (in[in_offset] & 0x80)
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ out[out_offset++] = in[in_offset++];
+ } while (out_offset < out_size);
+ return State::GOOD;
+ }
+};
+
+} // namespace
+
+// Returns a new UTF-8 decoder (a leading BOM is optional).
+std::unique_ptr<Decoder> create_utf8_decoder() {
+ return std::make_unique<Utf8Decoder>();
+}
+
+// Returns a new big endian UTF-16 decoder (a leading BOM is optional).
+std::unique_ptr<Decoder> create_utf16be_decoder() {
+ return std::make_unique<Utf16BeDecoder>();
+}
+
+// Returns a new little endian UTF-16 decoder (a leading BOM is optional).
+std::unique_ptr<Decoder> create_utf16le_decoder() {
+ return std::make_unique<Utf16LeDecoder>();
+}
+
+// Returns a new big endian UTF-32 decoder (a leading BOM is optional).
+std::unique_ptr<Decoder> create_utf32be_decoder() {
+ return std::make_unique<Utf32BeDecoder>();
+}
+
+// Returns a new little endian UTF-32 decoder (a leading BOM is optional).
+std::unique_ptr<Decoder> create_utf32le_decoder() {
+ return std::make_unique<Utf32LeDecoder>();
+}
+
+// Returns a new UTF-16 decoder that takes its byte order from a mandatory BOM.
+std::unique_ptr<Decoder> create_utf16_decoder() {
+ return std::make_unique<Utf16Decoder>();
+}
+
+// Returns a new UTF-32 decoder that takes its byte order from a mandatory BOM.
+std::unique_ptr<Decoder> create_utf32_decoder() {
+ return std::make_unique<Utf32Decoder>();
+}
+
+// Returns a new US-ASCII decoder.
+std::unique_ptr<Decoder> create_ascii_decoder() {
+ return std::make_unique<AsciiDecoder>();
+}
+
+} // namespace sax
+} // namespace modxml
+
diff --git a/sax/src/decoder.hh b/sax/src/decoder.hh
new file mode 100644
index 0000000..bd2a99a
--- /dev/null
+++ b/sax/src/decoder.hh
@@ -0,0 +1,33 @@
+#ifndef DECODER_HH
+#define DECODER_HH
+
+#include "macros.hh"
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+// Factories for the built-in decoders. Each call returns a new decoder.
+// UTF-8 with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf8_decoder();
+// UTF-16 with mandatory BOM (selects the byte order)
+std::unique_ptr<Decoder> HIDDEN create_utf16_decoder();
+// UTF-16BE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf16be_decoder();
+// UTF-16LE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf16le_decoder();
+// UTF-32 with mandatory BOM (selects the byte order)
+std::unique_ptr<Decoder> HIDDEN create_utf32_decoder();
+// UTF-32BE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf32be_decoder();
+// UTF-32LE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf32le_decoder();
+// US-ASCII
+std::unique_ptr<Decoder> HIDDEN create_ascii_decoder();
+
+} // namespace sax
+} // namespace modxml
+
+#endif // DECODER_HH
diff --git a/sax/src/processor.hh b/sax/src/processor.hh
new file mode 100644
index 0000000..4a2de29
--- /dev/null
+++ b/sax/src/processor.hh
@@ -0,0 +1,27 @@
+#ifndef PROCESSOR_HH
+#define PROCESSOR_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <optional>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class DecoderFactory;
+class Delegate;
+class Processor;
+
+// Internal factory shared by Processor and ProcessorBuilder.
+// decoder_factory may be null. Options left as std::nullopt fall back to the
+// defaults chosen by the implementation in sax_processor.cc.
+std::unique_ptr<Processor> HIDDEN create_processor(
+ std::shared_ptr<Delegate> delegate,
+ std::shared_ptr<DecoderFactory> decoder_factory,
+ std::optional<std::string> force_encoding,
+ std::optional<std::size_t> default_buffer_size,
+ std::optional<std::size_t> max_buffer_size);
+
+} // namespace sax
+} // namespace modxml
+
+#endif // PROCESSOR_HH
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
new file mode 100644
index 0000000..ea9f753
--- /dev/null
+++ b/sax/src/sax_processor.cc
@@ -0,0 +1,145 @@
+#include "sax_processor.hh"
+
+#include "sax_decoder.hh"
+#include "processor.hh"
+#include "utils.hh"
+
+#include <algorithm>
+#include <optional>
+#include <utility>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// 2.2 Characters
+// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+
+// Returns true if c matches production [2] Char.
+inline bool valid_char(uint32_t c) {
+  // Assume valid unicode (U+0 - U+10ffff except surrogate blocks)
+  if (c < 0x20) {
+    // Of the control characters only tab, line feed and carriage return
+    // are allowed.
+    return c == 0x9 || c == 0xa || c == 0xd;
+  }
+  // U+FFFE and U+FFFF are the only values at or above U+20 that are rejected.
+  return c <= 0xfffd || c >= 0x10000;
+}
+
+// 2.3 Common Syntactic Constructs
+// [3] S ::= (#x20 | #x9 | #xD | #xA)+
+
+// Returns true if c is one of the characters of production [3] S.
+inline bool is_ws(uint32_t c) {
+  // Only meaningful for values that already passed valid_char(): the only
+  // valid characters up to and including space are #x9, #xA, #xD and #x20.
+  constexpr uint32_t kSpace = 0x20;
+  return !(c > kSpace);
+}
+
+// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+
+// Returns true if c matches production [4] NameStartChar.
+// The ranges are checked in ascending order; each test below rejects one gap
+// between two allowed ranges.
+inline bool is_namestartchar(uint32_t c) {
+  if (c < 0x41 /* A */)
+    return c == 0x3a /* : */;
+  if (c <= 0x5a /* Z */)
+    return true;
+  if (c < 0x61 /* a */)
+    return c == 0x5f /* _ */;
+  if (c <= 0x7a /* z */)
+    return true;
+  if (c < 0xc0)
+    return false;
+  if (c < 0x300)
+    return c != 0xd7 && c != 0xf7;
+  // [#x300-#x36F] is only allowed by NameChar, and #x37E by neither.
+  if (c < 0x370 || c == 0x37e)
+    return false;
+  if (c > 0x1fff && c < 0x200c)
+    return false;
+  if (c > 0x200d && c < 0x2070)
+    return false;
+  if (c > 0x218f && c < 0x2c00)
+    return false;
+  if (c > 0x2fef && c < 0x3001)
+    return false;
+  // Callers are expected to have run valid_char() first, but rejecting the
+  // surrogate block here as well costs nothing.
+  if (c > 0xd7ff && c < 0xf900)
+    return false;
+  if (c > 0xfdcf && c < 0xfdf0)
+    return false;
+  if (c > 0xfffd && c < 0x10000)
+    return false;
+  // The last allowed range is [#x10000-#xEFFFF]; everything above it is not
+  // a name character.
+  return c <= 0xeffff;
+}
+
+// Returns true if c matches production [4a] NameChar.
+inline bool is_namechar(uint32_t c) {
+  if (is_namestartchar(c))
+    return true;
+  switch (c) {
+    case 0x2d /* - */:
+    case 0x2e /* . */:
+    case 0xb7:
+    case 0x203f:
+    case 0x2040:
+      return true;
+    default:
+      break;
+  }
+  if (c >= 0x30 /* 0 */ && c <= 0x39 /* 9 */)
+    return true;
+  return c >= 0x300 && c <= 0x36f;
+}
+
+/* [5] Name ::= NameStartChar (NameChar)*
+[6] Names ::= Name (#x20 Name)*
+[7] Nmtoken ::= (NameChar)+
+[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+*/
+
+// Concrete Processor. At this point it only stores its configuration; no
+// processing logic is implemented in this class yet.
+class ProcessorImpl : public Processor {
+ public:
+ // Takes ownership of the decoder; delegate and decoder_factory are shared.
+ // create_processor() passes a null decoder when no encoding is forced (or
+ // when the forced encoding has no decoder) and a null decoder_factory when
+ // none was configured.
+ ProcessorImpl(std::shared_ptr<Delegate> delegate,
+ std::shared_ptr<DecoderFactory> decoder_factory,
+ std::unique_ptr<Decoder> decoder,
+ std::size_t default_buffer_size,
+ std::size_t max_buffer_size)
+ : delegate_(std::move(delegate)),
+ decoder_factory_(std::move(decoder_factory)),
+ decoder_(std::move(decoder)),
+ default_buffer_size_(default_buffer_size),
+ max_buffer_size_(max_buffer_size) {}
+
+ private:
+ std::shared_ptr<Delegate> delegate_;
+ std::shared_ptr<DecoderFactory> decoder_factory_;
+ std::unique_ptr<Decoder> decoder_;
+ std::size_t default_buffer_size_;
+ std::size_t max_buffer_size_;
+};
+
+} // namespace
+
+// Resolves the optional settings to concrete values and builds the Processor.
+std::unique_ptr<Processor> create_processor(
+    std::shared_ptr<Delegate> delegate,
+    std::shared_ptr<DecoderFactory> decoder_factory,
+    std::optional<std::string> force_encoding,
+    std::optional<std::size_t> opt_default_buffer_size,
+    std::optional<std::size_t> opt_max_buffer_size) {
+  // Requested default buffer sizes below this are raised to it.
+  constexpr std::size_t kMinDefaultBufferSize = 128;
+  constexpr std::size_t kDefaultBufferSize = 8192;
+  // This value is documented in public headers. Do NOT change.
+  constexpr std::size_t kDefaultMaxBufferSize = 10 * 1024 * 1024;
+
+  // Only pick a decoder up front when the caller forced an encoding.
+  std::unique_ptr<Decoder> decoder;
+  if (force_encoding) {
+    decoder = pick_decoder_for_encoding(*force_encoding,
+                                        decoder_factory.get());
+  }
+
+  std::size_t const default_buffer_size =
+      opt_default_buffer_size
+          ? std::max(kMinDefaultBufferSize, *opt_default_buffer_size)
+          : kDefaultBufferSize;
+
+  // No validation for user set value. If it is too small MAX_MEMORY_EXCEEDED
+  // error will be thrown. If it is too large we will get OUT_OF_MEMORY or
+  // crash depending on platform.
+  std::size_t const max_buffer_size =
+      opt_max_buffer_size.value_or(kDefaultMaxBufferSize);
+
+  return std::make_unique<ProcessorImpl>(std::move(delegate),
+                                         std::move(decoder_factory),
+                                         std::move(decoder),
+                                         default_buffer_size,
+                                         max_buffer_size);
+}
+
+// Definition of the static factory declared in sax_processor.hh: builds a
+// Processor with no custom decoder factory and every option left at its
+// default. It must be qualified with Processor::, otherwise this defines an
+// unrelated free function and the declared member is never defined.
+std::unique_ptr<Processor> Processor::create(
+    std::shared_ptr<Delegate> delegate) {
+  return create_processor(std::move(delegate), nullptr,
+                          std::nullopt, std::nullopt, std::nullopt);
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/sax_processor_builder.cc b/sax/src/sax_processor_builder.cc
new file mode 100644
index 0000000..8817099
--- /dev/null
+++ b/sax/src/sax_processor_builder.cc
@@ -0,0 +1,62 @@
+#include "sax_processor_builder.hh"
+
+#include "processor.hh"
+#include "sax_processor.hh"
+
+#include <optional>
+#include <utility>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// ProcessorBuilder implementation: records the options that were set and
+// hands them to create_processor() in build(). Options never set stay
+// empty (std::nullopt / null factory).
+class ProcessorBuilderImpl : public ProcessorBuilder {
+ public:
+ ProcessorBuilder* force_encoding(std::string const& str) override {
+ force_encoding_ = str;
+ return this;
+ }
+
+ ProcessorBuilder* custom_decoder_factory(
+ std::shared_ptr<DecoderFactory> custom_decoder_factory) override {
+ decoder_factory_ = std::move(custom_decoder_factory);
+ return this;
+ }
+
+ ProcessorBuilder* set_default_buffer_size(std::size_t size) override {
+ default_buffer_size_ = size;
+ return this;
+ }
+
+ ProcessorBuilder* set_max_buffer_size(std::size_t size) override {
+ max_buffer_size_ = size;
+ return this;
+ }
+
+ // Options are copied, not moved, so the builder can be reused.
+ std::unique_ptr<Processor> build(
+ std::shared_ptr<Delegate> delegate) const override {
+ return create_processor(std::move(delegate),
+ decoder_factory_,
+ force_encoding_,
+ default_buffer_size_,
+ max_buffer_size_);
+ }
+
+ // Public so ProcessorBuilder::create() can use std::make_unique.
+ ProcessorBuilderImpl() = default;
+
+ private:
+ std::shared_ptr<DecoderFactory> decoder_factory_;
+ std::optional<std::string> force_encoding_;
+ std::optional<std::size_t> default_buffer_size_;
+ std::optional<std::size_t> max_buffer_size_;
+};
+
+} // namespace
+
+// Returns a new builder with no options set.
+std::unique_ptr<ProcessorBuilder> ProcessorBuilder::create() {
+ return std::make_unique<ProcessorBuilderImpl>();
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/utils.cc b/sax/src/utils.cc
new file mode 100644
index 0000000..f0366d5
--- /dev/null
+++ b/sax/src/utils.cc
@@ -0,0 +1,70 @@
+#include "utils.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Normalizes an encoding name for comparison: ASCII letters are lowercased,
+// digits are kept, '.', '_' and '-' all become '-', and every other
+// character is dropped.
+std::string cleanup_encoding(std::string const& str) {
+  std::string cleaned;
+  cleaned.reserve(str.size());
+  for (char const ch : str) {
+    bool const is_upper = ch >= 'A' && ch <= 'Z';
+    bool const is_lower = ch >= 'a' && ch <= 'z';
+    bool const is_digit = ch >= '0' && ch <= '9';
+    bool const is_separator = ch == '.' || ch == '_' || ch == '-';
+    if (is_upper) {
+      // Setting bit 0x20 turns an ASCII uppercase letter into lowercase.
+      cleaned.push_back(ch | 0x20);
+    } else if (is_lower || is_digit) {
+      cleaned.push_back(ch);
+    } else if (is_separator) {
+      cleaned.push_back('-');
+    }
+  }
+  return cleaned;
+}
+
+} // namespace
+
+// Names inspired by:
+// https://www.iana.org/assignments/character-sets/character-sets.xhtml
+// Returns a decoder for the given encoding name. Built-in encodings are
+// matched on the cleaned-up name; anything else is passed, with the name
+// exactly as given, to factory if there is one. Returns nullptr when nothing
+// matches.
+std::unique_ptr<Decoder> pick_decoder_for_encoding(
+    std::string const& encoding, DecoderFactory* factory) {
+  struct Builtin {
+    char const* name;
+    std::unique_ptr<Decoder> (*create)();
+  };
+  // Names are in the form produced by cleanup_encoding().
+  static constexpr Builtin kBuiltins[] = {
+      {"utf-8", create_utf8_decoder},
+      {"utf8", create_utf8_decoder},
+      {"utf-16", create_utf16_decoder},
+      {"utf16", create_utf16_decoder},
+      {"utf-16be", create_utf16be_decoder},
+      {"utf16be", create_utf16be_decoder},
+      {"utf-16le", create_utf16le_decoder},
+      {"utf16le", create_utf16le_decoder},
+      {"utf-32", create_utf32_decoder},
+      {"utf32", create_utf32_decoder},
+      {"utf-32be", create_utf32be_decoder},
+      {"utf32be", create_utf32be_decoder},
+      {"utf-32le", create_utf32le_decoder},
+      {"utf32le", create_utf32le_decoder},
+      {"ascii", create_ascii_decoder},
+      {"us-ascii", create_ascii_decoder},
+      {"usascii", create_ascii_decoder},
+      {"iso-ir-6", create_ascii_decoder},
+      {"ansi-x3-4-1968", create_ascii_decoder},
+      {"ansi-x3-4-1986", create_ascii_decoder},
+      {"iso-646-irv1991", create_ascii_decoder},
+      {"iso646-us", create_ascii_decoder},
+      {"us", create_ascii_decoder},
+      {"ibm367", create_ascii_decoder},
+      {"cp367", create_ascii_decoder},
+  };
+
+  std::string const normalized = cleanup_encoding(encoding);
+  for (Builtin const& entry : kBuiltins) {
+    if (normalized == entry.name)
+      return entry.create();
+  }
+
+  // Not built in: the custom factory gets the name exactly as given.
+  if (factory == nullptr)
+    return nullptr;
+  return factory->create(encoding);
+}
+
+} // namespace sax
+
+} // namespace modxml
diff --git a/sax/src/utils.hh b/sax/src/utils.hh
new file mode 100644
index 0000000..206d003
--- /dev/null
+++ b/sax/src/utils.hh
@@ -0,0 +1,22 @@
+#ifndef UTILS_HH
+#define UTILS_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+class DecoderFactory;
+
+// Returns a decoder for encoding: one of the built-in decoders if the name
+// matches, otherwise whatever factory (may be null) returns for it.
+// Returns nullptr if the encoding is not supported.
+std::unique_ptr<Decoder> HIDDEN pick_decoder_for_encoding(
+ std::string const& encoding,
+ DecoderFactory* factory);
+
+} // namespace sax
+} // namespace modxml
+
+#endif // UTILS_HH