From fc4547b412e28164af1bf8981234c6af959ccc0b Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Tue, 13 Jun 2023 10:07:16 +0200 Subject: WIP --- sax/inc/sax_decoder.hh | 57 +++++++ sax/inc/sax_decoder_factory.hh | 35 +++++ sax/inc/sax_delegate.hh | 22 +++ sax/inc/sax_error.hh | 36 +++++ sax/inc/sax_processor.hh | 37 +++++ sax/inc/sax_processor_builder.hh | 82 ++++++++++ sax/meson.build | 22 +++ sax/src/decoder.cc | 321 +++++++++++++++++++++++++++++++++++++++ sax/src/decoder.hh | 33 ++++ sax/src/processor.hh | 27 ++++ sax/src/sax_processor.cc | 145 ++++++++++++++++++ sax/src/sax_processor_builder.cc | 62 ++++++++ sax/src/utils.cc | 70 +++++++++ sax/src/utils.hh | 22 +++ 14 files changed, 971 insertions(+) create mode 100644 sax/inc/sax_decoder.hh create mode 100644 sax/inc/sax_decoder_factory.hh create mode 100644 sax/inc/sax_delegate.hh create mode 100644 sax/inc/sax_error.hh create mode 100644 sax/inc/sax_processor.hh create mode 100644 sax/inc/sax_processor_builder.hh create mode 100644 sax/meson.build create mode 100644 sax/src/decoder.cc create mode 100644 sax/src/decoder.hh create mode 100644 sax/src/processor.hh create mode 100644 sax/src/sax_processor.cc create mode 100644 sax/src/sax_processor_builder.cc create mode 100644 sax/src/utils.cc create mode 100644 sax/src/utils.hh (limited to 'sax') diff --git a/sax/inc/sax_decoder.hh b/sax/inc/sax_decoder.hh new file mode 100644 index 0000000..40a56c9 --- /dev/null +++ b/sax/inc/sax_decoder.hh @@ -0,0 +1,57 @@ +#ifndef SAX_DECODER_HH +#define SAX_DECODER_HH + +#include +#include +#include + +namespace modxml { +namespace sax { + +/** + * Decoder returned by DecoderFactory. Used by Processor to turn bytes into + * unicode characters. 
+ */ +class Decoder { + public: + virtual ~Decoder() = default; + + enum class State { + GOOD = 0, + // too little data was given to advance + NEED_MORE, + // invalid data was given to advance + INVALID, + }; + + /** + * Decode as many code points as possible from in (start at in_offset) and + * write them to out (start at out_offset) as UTF-8. + * All written code points must be valid per Unicode, so inside the + * range U+0 to U+10FFFF and not a surrogate (U+D800-U+DFFF). + * No partial output, only write to out if the whole UTF-8 sequence is + * going to fit. + * There are always at least 4 bytes available (out.size() - out_offset) when + * called. + * Advance in_offset for data consumed. + * Advance out_offset for code points written. Do NOT write past out.size(). + * Do NOT resize out. + * If at least one code point is decoded and written to out, return GOOD. + * If it is not possible to decode a single code point, in_offset and + * out_offset should not be advanced and something other than GOOD returned. + * Do not keep any references to any of the parameters after returning, next + * decode() call will point to the following bytes, but all parameters + * may have changed as they are subject to the buffer implementations of the + * Processor. + */ + virtual State decode(std::string_view in, std::size_t& in_offset, + std::string& out, std::size_t& out_offset) = 0; + + protected: + Decoder() = default; +}; + +} // namespace sax +} // namespace modxml + +#endif // SAX_DECODER_HH diff --git a/sax/inc/sax_decoder_factory.hh b/sax/inc/sax_decoder_factory.hh new file mode 100644 index 0000000..80f1af3 --- /dev/null +++ b/sax/inc/sax_decoder_factory.hh @@ -0,0 +1,35 @@ +#ifndef SAX_DECODER_FACTORY_HH +#define SAX_DECODER_FACTORY_HH + +#include +#include + +namespace modxml { +namespace sax { + +class Decoder; + +/** + * Factory for decoders. You can give one to ProcessorBuilder. 
+ */ +class DecoderFactory { + public: + virtual ~DecoderFactory() = default; + + /** + * If encoding is supported, return a decoder for that encoding. + * Return nullptr if not supported and Processor will return + * UNKNOWN_ENCODING error. + * Note that encoding value isn't cleaned up or validated in any way, it is + * reported EXACTLY as found (even if not valid per XML spec). + */ + virtual std::unique_ptr create(std::string const& encoding) = 0; + + protected: + DecoderFactory() = default; +}; + +} // namespace sax +} // namespace modxml + +#endif // SAX_DECODER_FACTORY_HH diff --git a/sax/inc/sax_delegate.hh b/sax/inc/sax_delegate.hh new file mode 100644 index 0000000..ba63e72 --- /dev/null +++ b/sax/inc/sax_delegate.hh @@ -0,0 +1,22 @@ +#ifndef MODXML_SAX_DELEGATE_HH +#define MODXML_SAX_DELEGATE_HH + +namespace modxml { +namespace sax { + +/** + * Delegate for processor. + * Implement to handle events. + */ +class Delegate { + public: + virtual ~Delegate() = default; + + protected: + Delegate() = default; +}; + +} // namespace sax +} // namespace modxml + +#endif // MODXML_SAX_DELEGATE_HH diff --git a/sax/inc/sax_error.hh b/sax/inc/sax_error.hh new file mode 100644 index 0000000..748f995 --- /dev/null +++ b/sax/inc/sax_error.hh @@ -0,0 +1,36 @@ +#ifndef MODXML_SAX_ERROR_HH +#define MODXML_SAX_ERROR_HH + +namespace modxml { +namespace sax { + +enum class Error { + /** + * The XML spec has a list of characters that are never allowed in a document. + */ + INVALID_CHAR, + /** + * If the document encoding is unsupported or unknown. + */ + UNKNOWN_ENCODING, + /** + * If the document is incomplete. This is one of the few recoverable errors, + * if you call the processor with more data it will continue. + */ + INCOMPLETE, + /** + * An entity in the document exceeded max buffer size (either set by + * ProcessorBuilder or the default 10 MiB). + */ + MAX_MEMORY_EXCEEDED, + /** + * A memory allocation failed. 
Note that this doesn't protect against + * usage of overallocated memory. + */ + OUT_OF_MEMORY, +}; + +} // namespace sax +} // namespace modxml + +#endif // MODXML_SAX_ERROR_HH diff --git a/sax/inc/sax_processor.hh b/sax/inc/sax_processor.hh new file mode 100644 index 0000000..7ca32f7 --- /dev/null +++ b/sax/inc/sax_processor.hh @@ -0,0 +1,37 @@ +#ifndef MODXML_SAX_PROCESSOR_HH +#define MODXML_SAX_PROCESSOR_HH + +#include + +namespace modxml { +namespace sax { + +class Delegate; + +/** + * The XML processor, or parser if you like that term better. + * Feed it data and the processor will call the delegate with events or + * possibly errors. + */ +class Processor { + public: + virtual ~Processor() = default; + + /** + * Construct a Processor. Same as creating a ProcessorBuilder + * and not changing any options and just calling build. + */ + static std::unique_ptr create(std::shared_ptr delegate); + + protected: + Processor() = default; + + private: + Processor(Processor const&) = delete; + Processor& operator=(Processor const&) = delete; +}; + +} // namespace sax +} // namespace modxml + +#endif // MODXML_SAX_PROCESSOR_HH diff --git a/sax/inc/sax_processor_builder.hh b/sax/inc/sax_processor_builder.hh new file mode 100644 index 0000000..070fbbf --- /dev/null +++ b/sax/inc/sax_processor_builder.hh @@ -0,0 +1,82 @@ +#ifndef MODXML_SAX_PROCESSOR_BUILDER_HH +#define MODXML_SAX_PROCESSOR_BUILDER_HH + +#include +#include + +namespace modxml { +namespace sax { + +class DecoderFactory; +class Delegate; +class Processor; + +/** + * Used to construct Processors with options set if needed. + */ +class ProcessorBuilder { + public: + virtual ~ProcessorBuilder() = default; + + /** + * Construct a ProcessorBuilder. All options are set to default. 
+ */ + static std::unique_ptr create(); + + /** + * If you know the encoding of the data sent to the processor set it here, + * this will stop the processor from trying to autodetect and will ignore + * encoding in any XML declaration if found. + * If the encoding is unsupported/unknown the processor will fail with + * an error indicating this, same as if it read an XML declaration with + * an unsupported or unknown encoding. + */ + virtual ProcessorBuilder* force_encoding(std::string const& str) = 0; + + /** + * Set a decoder factory for encodings not supported by the library. + * Library only calls this for encodings it doesn't support itself. + * Library supports UTF-8, UTF-16, UTF-32 and US-ASCII. + * If you want to force the decoder factory to be used, force a custom + * encoding with force_encoding above. + */ + virtual ProcessorBuilder* custom_decoder_factory( + std::shared_ptr custom_decoder_factory) = 0; + + /** + * Set the default buffer size the processor should use. + * If you give a too small buffer size (such as zero) it will be ignored + * and an implementation-specific minimum will be used instead. + * This is meant as a possible optimization and can be completely ignored. + * Note that the processor will allocate more data if needed. + */ + virtual ProcessorBuilder* set_default_buffer_size(std::size_t size) = 0; + + /** + * Set the max buffer size the processor should use. + * If you have memory constraints this will block the processing of CDATA, + * or other entities from allocating more than the given size. + * Default is 10 MiB. + */ + virtual ProcessorBuilder* set_max_buffer_size(std::size_t size) = 0; + + /** + * Call to construct a Processor with the options set up in this builder, + * using the delegate given as parameter. + * May be called multiple times, will create a unique Processor each time. 
+ */ + virtual std::unique_ptr build( + std::shared_ptr delegate) const = 0; + + protected: + ProcessorBuilder() = default; + + private: + ProcessorBuilder(ProcessorBuilder const&) = delete; + ProcessorBuilder& operator=(ProcessorBuilder const&) = delete; +}; + +} // namespace sax +} // namespace modxml + +#endif // MODXML_SAX_PROCESSOR_BUILDER_HH diff --git a/sax/meson.build b/sax/meson.build new file mode 100644 index 0000000..ccbdef4 --- /dev/null +++ b/sax/meson.build @@ -0,0 +1,22 @@ +deps = [ + base_dep, + utf_dep, +] + +inc = include_directories('inc') +lib = shared_library( + 'modxmlsax', + 'src/decoder.cc', + 'src/sax_processor.cc', + 'src/sax_processor_builder.cc', + 'src/utils.cc', + dependencies: deps, + include_directories: inc, + install: true, +) + +sax_dep = declare_dependency( + dependencies: deps, + include_directories: inc, + link_with: lib, +) diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc new file mode 100644 index 0000000..30b1735 --- /dev/null +++ b/sax/src/decoder.cc @@ -0,0 +1,321 @@ +#include "decoder.hh" + +#include "macros.hh" +#include "sax_decoder.hh" +#include "utf16.hh" +#include "utf32.hh" +#include "utf8.hh" +#include "utf_error.hh" + +namespace modxml { +namespace sax { + +namespace { + +class UtfDecoder : public Decoder { + public: + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + if (bom_ == -1) UNLIKELY { + std::size_t tmp = in_offset; + uint32_t ret = read(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + if (ret == 0xfeff) { + // To allow offset to advance and to return, we need to + // read at least one more character completely. 
+ ret = read(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + bom_ = 1; + } else { + bom_ = 0; + } + in_offset = tmp; + out[out_offset++] = ret; + if (out_offset == out_size) + return State::GOOD; + } + + do { + uint32_t ret = read(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + return State::GOOD; + } + + protected: + UtfDecoder() = default; + + virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0; + + private: + int8_t bom_{-1}; +}; + +class Utf8Decoder : public UtfDecoder { + public: + Utf8Decoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read8(data, offset); + } +}; + +class Utf16BeDecoder : public UtfDecoder { + public: + Utf16BeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read16be(data, offset); + } +}; + +class Utf16LeDecoder : public UtfDecoder { + public: + Utf16LeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read16le(data, offset); + } +}; + +class Utf32BeDecoder : public UtfDecoder { + public: + Utf32BeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read32be(data, offset); + } +}; + +class Utf32LeDecoder : public UtfDecoder { + public: + Utf32LeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read32le(data, offset); + } +}; + +class Utf16Decoder : public Decoder { + public: + Utf16Decoder() = default; + + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t 
out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + if (endian_ == -1) UNLIKELY { + std::size_t tmp = in_offset; + uint32_t ret = utf::read16be(in, tmp); + int8_t endian; + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + if (ret == 0xfeff) { + endian = 1; // Big endian + } else if (ret == 0xfffe) { + endian = 0; // Little endian + } else { + return State::INVALID; + } + + // To allow offset to advance and to return, we need to + // read at least one more character completely. + ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + + endian_ = endian; + in_offset = tmp; + out[out_offset++] = ret; + if (out_offset == out_size) + return State::GOOD; + } + + if (endian_ == 1) { + do { + uint32_t ret = utf::read16be(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } else { + do { + uint32_t ret = utf::read16le(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? 
State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } + return State::GOOD; + } + + private: + int8_t endian_{-1}; +}; + +class Utf32Decoder : public Decoder { + public: + Utf32Decoder() = default; + + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + if (endian_ == -1) UNLIKELY { + std::size_t tmp = in_offset; + uint32_t ret = utf::read32be(in, tmp); + int8_t endian; + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + tmp = in_offset; + ret = utf::read32le(in, tmp); + if (ret == 0xfeff) { + endian = 0; // Little endian + } else { + return State::INVALID; + } + } else if (ret == 0xfeff) { + endian = 1; // Big endian + } else { + return State::INVALID; + } + + // To allow offset to advance and to return, we need to + // read the next character completely. + ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + + endian_ = endian; + in_offset = tmp; + out[out_offset++] = ret; + if (out_offset == out_size) + return State::GOOD; + } + + if (endian_ == 1) { + do { + uint32_t ret = utf::read32be(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } else { + do { + uint32_t ret = utf::read32le(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? 
State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } + return State::GOOD; + } + + private: + int8_t endian_{-1}; +}; + +class AsciiDecoder : public Decoder { + public: + AsciiDecoder() = default; + + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + do { + if (in_offset == in.size()) + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + if (in[in_offset] & 0x80) + return out_offset > out_start ? State::GOOD : State::INVALID; + out[out_offset++] = in[in_offset++]; + } while (out_offset < out_size); + return State::GOOD; + } +}; + +} // namespace + +std::unique_ptr create_utf8_decoder() { + return std::make_unique(); +} + +std::unique_ptr create_utf16be_decoder() { + return std::make_unique(); +} + +std::unique_ptr create_utf16le_decoder() { + return std::make_unique(); +} + +std::unique_ptr create_utf32be_decoder() { + return std::make_unique(); +} + +std::unique_ptr create_utf32le_decoder() { + return std::make_unique(); +} + +std::unique_ptr create_utf16_decoder() { + return std::make_unique(); +} + +std::unique_ptr create_utf32_decoder() { + return std::make_unique(); +} + +std::unique_ptr create_ascii_decoder() { + return std::make_unique(); +} + +} // namespace sax +} // namespace modxml + diff --git a/sax/src/decoder.hh b/sax/src/decoder.hh new file mode 100644 index 0000000..bd2a99a --- /dev/null +++ b/sax/src/decoder.hh @@ -0,0 +1,33 @@ +#ifndef DECODER_HH +#define DECODER_HH + +#include "macros.hh" + +#include + +namespace modxml { +namespace sax { + +class Decoder; + +// UTF-8 with optional BOM +std::unique_ptr HIDDEN create_utf8_decoder(); +// UTF-16 with BOM +std::unique_ptr HIDDEN create_utf16_decoder(); +// UTF-16BE with optional BOM +std::unique_ptr HIDDEN create_utf16be_decoder(); +// UTF-16LE with optional BOM +std::unique_ptr HIDDEN create_utf16le_decoder(); 
+// UTF-32 with BOM +std::unique_ptr HIDDEN create_utf32_decoder(); +// UTF-32BE with optional BOM +std::unique_ptr HIDDEN create_utf32be_decoder(); +// UTF-32LE with optional BOM +std::unique_ptr HIDDEN create_utf32le_decoder(); +// US-ASCII +std::unique_ptr HIDDEN create_ascii_decoder(); + +} // namespace sax +} // namespace modxml + +#endif // DECODER_HH diff --git a/sax/src/processor.hh b/sax/src/processor.hh new file mode 100644 index 0000000..4a2de29 --- /dev/null +++ b/sax/src/processor.hh @@ -0,0 +1,27 @@ +#ifndef PROCESSOR_HH +#define PROCESSOR_HH + +#include "macros.hh" + +#include +#include +#include + +namespace modxml { +namespace sax { + +class DecoderFactory; +class Delegate; +class Processor; + +std::unique_ptr HIDDEN create_processor( + std::shared_ptr delegate, + std::shared_ptr decoder_factory, + std::optional force_encoding, + std::optional default_buffer_size, + std::optional max_buffer_size); + +} // namespace sax +} // namespace modxml + +#endif // PROCESSOR_HH diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc new file mode 100644 index 0000000..ea9f753 --- /dev/null +++ b/sax/src/sax_processor.cc @@ -0,0 +1,145 @@ +#include "sax_processor.hh" + +#include "sax_decoder.hh" +#include "processor.hh" +#include "utils.hh" + +#include +#include +#include + +namespace modxml { +namespace sax { + +namespace { + +// 2.2 Characters +// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + +inline bool valid_char(uint32_t c) { + // Assume valid unicode (U+0 - U+10ffff except surrogate blocks) + if (c >= 0x20 && c <= 0xfffd) + return true; + if (c == 0x9 || c == 0xa || c == 0xd) + return true; + return c >= 0x10000; +} + +// 2.3 Common Syntactic Constructs +// [3] S ::= (#x20 | #x9 | #xD | #xA)+ + +inline bool is_ws(uint32_t c) { + // Assume we already checked for valid_char. 
+ return c <= 0x20; +} + +// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] +// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] + +inline bool is_namestartchar(uint32_t c) { + if (c < 0x41 /* A */) + return c == 0x3a /* : */; + if (c <= 0x5a /* Z */) + return true; + if (c < 0x61 /* a */) + return c == 0x5f /* _ */; + if (c <= 0x7a /* z */) + return true; + if (c < 0xc0) + return false; + if (c < 0x300) + return c != 0xd7 && c != 0xf7; + if (c < 0x370 || c == 0x37e) // [#x300-#x36F] is NameChar-only; #x37E is excluded + return false; + if (c > 0x1fff && c < 0x200c) + return false; + if (c > 0x200d && c < 0x2070) + return false; + if (c > 0x218f && c < 0x2c00) + return false; + if (c > 0x2fef && c < 0x3001) + return false; + // Caller has already applied valid_char, so surrogates are not checked here. + if (c > 0xdfff && c < 0xf900) + return false; + if (c > 0xfdcf && c < 0xfdf0) + return false; + if (c > 0xfffd && c < 0x10000) + return false; + return c <= 0xeffff; // [#x10000-#xEFFFF] is the last allowed range +} + +inline bool is_namechar(uint32_t c) { + return is_namestartchar(c) || (c >= 0x2d /* - */ && c <= 0x2e /* . 
*/) || + (c >= 0x30 /* 0 */ && c <= 0x39 /* 9 */) || (c == 0xb7) || + (c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040); +} + +/* [5] Name ::= NameStartChar (NameChar)* +[6] Names ::= Name (#x20 Name)* +[7] Nmtoken ::= (NameChar)+ +[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* +*/ + +class ProcessorImpl : public Processor { + public: + ProcessorImpl(std::shared_ptr delegate, + std::shared_ptr decoder_factory, + std::unique_ptr decoder, + std::size_t default_buffer_size, + std::size_t max_buffer_size) + : delegate_(std::move(delegate)), + decoder_factory_(std::move(decoder_factory)), + decoder_(std::move(decoder)), + default_buffer_size_(default_buffer_size), + max_buffer_size_(max_buffer_size) {} + + private: + std::shared_ptr delegate_; + std::shared_ptr decoder_factory_; + std::unique_ptr decoder_; + std::size_t default_buffer_size_; + std::size_t max_buffer_size_; +}; + +} // namespace + +std::unique_ptr create_processor( + std::shared_ptr delegate, + std::shared_ptr decoder_factory, + std::optional force_encoding, + std::optional opt_default_buffer_size, + std::optional opt_max_buffer_size) { + + std::unique_ptr decoder; + if (force_encoding.has_value()) { + decoder = pick_decoder_for_encoding(force_encoding.value(), + decoder_factory.get()); + } + + std::size_t default_buffer_size = 8192; + if (opt_default_buffer_size.has_value()) + default_buffer_size = std::max(static_cast(128), + opt_default_buffer_size.value()); + // This value is documented in public headers. Do NOT change. + std::size_t max_buffer_size = 10 * 1024 * 1024; + // No validation for user set value. If it is too small MAX_MEMORY_EXCEEDED + // error will be thrown. If it is too large we will get OUT_OF_MEMORY or + // crash depending on platform. 
+ if (opt_max_buffer_size.has_value()) + max_buffer_size = opt_max_buffer_size.value(); + + return std::make_unique(std::move(delegate), + std::move(decoder_factory), + std::move(decoder), + default_buffer_size, + max_buffer_size); +} + +std::unique_ptr Processor::create(std::shared_ptr delegate) { + return create_processor(std::move(delegate), nullptr, + std::nullopt, std::nullopt, std::nullopt); // all builder options left at their defaults +} + +} // namespace sax +} // namespace modxml diff --git a/sax/src/sax_processor_builder.cc b/sax/src/sax_processor_builder.cc new file mode 100644 index 0000000..8817099 --- /dev/null +++ b/sax/src/sax_processor_builder.cc @@ -0,0 +1,62 @@ +#include "sax_processor_builder.hh" + +#include "processor.hh" +#include "sax_processor.hh" + +#include +#include + +namespace modxml { +namespace sax { + +namespace { + +class ProcessorBuilderImpl : public ProcessorBuilder { + public: + ProcessorBuilder* force_encoding(std::string const& str) override { + force_encoding_ = str; + return this; + } + + ProcessorBuilder* custom_decoder_factory( + std::shared_ptr custom_decoder_factory) override { + decoder_factory_ = std::move(custom_decoder_factory); + return this; + } + + ProcessorBuilder* set_default_buffer_size(std::size_t size) override { + default_buffer_size_ = size; + return this; + } + + ProcessorBuilder* set_max_buffer_size(std::size_t size) override { + max_buffer_size_ = size; + return this; + } + + std::unique_ptr build( + std::shared_ptr delegate) const override { + return create_processor(std::move(delegate), + decoder_factory_, + force_encoding_, + default_buffer_size_, + max_buffer_size_); + } + + ProcessorBuilderImpl() = default; + + private: + std::shared_ptr decoder_factory_; + std::optional force_encoding_; + std::optional default_buffer_size_; + std::optional max_buffer_size_; +}; + +} // namespace + +std::unique_ptr ProcessorBuilder::create() { + return std::make_unique(); +} + +} // namespace sax +} // namespace modxml diff --git a/sax/src/utils.cc b/sax/src/utils.cc new 
file mode 100644 index 0000000..f0366d5 --- /dev/null +++ b/sax/src/utils.cc @@ -0,0 +1,70 @@ +#include "utils.hh" + +#include "decoder.hh" +#include "sax_decoder.hh" +#include "sax_decoder_factory.hh" + +namespace modxml { +namespace sax { + +namespace { + +std::string cleanup_encoding(std::string const& str) { + std::string ret; + ret.reserve(str.size()); + for (auto c : str) { + if (c >= 'A' && c <= 'Z') { + ret.push_back(c | 0x20); + } else if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) { + ret.push_back(c); + } else if (c == '.' || c == '_' || c == '-') { + ret.push_back('-'); + } + } + return ret; +} + +} // namespace + +// Names inspired by: +// https://www.iana.org/assignments/character-sets/character-sets.xhtml +std::unique_ptr pick_decoder_for_encoding( + std::string const& encoding, DecoderFactory* factory) { + auto clean_enc = cleanup_encoding(encoding); + if (clean_enc == "utf-8" || clean_enc == "utf8") { + return create_utf8_decoder(); + } + if (clean_enc == "utf-16" || clean_enc == "utf16") { + return create_utf16_decoder(); + } + if (clean_enc == "utf-16be" || clean_enc == "utf16be") { + return create_utf16be_decoder(); + } + if (clean_enc == "utf-16le" || clean_enc == "utf16le") { + return create_utf16le_decoder(); + } + if (clean_enc == "utf-32" || clean_enc == "utf32") { + return create_utf32_decoder(); + } + if (clean_enc == "utf-32be" || clean_enc == "utf32be") { + return create_utf32be_decoder(); + } + if (clean_enc == "utf-32le" || clean_enc == "utf32le") { + return create_utf32le_decoder(); + } + if (clean_enc == "ascii" || clean_enc == "us-ascii" || + clean_enc == "usascii" || clean_enc == "iso-ir-6" || + clean_enc == "ansi-x3-4-1968" || clean_enc == "ansi-x3-4-1986" || + clean_enc == "iso-646-irv1991" || clean_enc == "iso646-us" || + clean_enc == "us" || clean_enc == "ibm367" || clean_enc == "cp367") { + return create_ascii_decoder(); + } + if (factory) { + return factory->create(encoding); + } + return nullptr; +} + +} // namespace 
sax + +} // namespace modxml diff --git a/sax/src/utils.hh b/sax/src/utils.hh new file mode 100644 index 0000000..206d003 --- /dev/null +++ b/sax/src/utils.hh @@ -0,0 +1,22 @@ +#ifndef UTILS_HH +#define UTILS_HH + +#include "macros.hh" + +#include +#include + +namespace modxml { +namespace sax { + +class Decoder; +class DecoderFactory; + +std::unique_ptr HIDDEN pick_decoder_for_encoding( + std::string const& encoding, + DecoderFactory* factory); + +} // namespace sax +} // namespace modxml + +#endif // UTILS_HH -- cgit v1.2.3-70-g09d2