summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoel Klinghed <the_jk@spawned.biz>2023-06-13 10:07:16 +0200
committerJoel Klinghed <the_jk@spawned.biz>2023-06-13 10:07:16 +0200
commitfc4547b412e28164af1bf8981234c6af959ccc0b (patch)
tree061253e7a4f6abaca282223b36d10f0bed8cad23
WIP
-rw-r--r--.dir-locals.el17
-rw-r--r--.gitignore3
-rw-r--r--base/inc/macros.hh16
-rw-r--r--base/meson.build19
-rw-r--r--meson.build30
-rw-r--r--sax/inc/sax_decoder.hh57
-rw-r--r--sax/inc/sax_decoder_factory.hh35
-rw-r--r--sax/inc/sax_delegate.hh22
-rw-r--r--sax/inc/sax_error.hh36
-rw-r--r--sax/inc/sax_processor.hh37
-rw-r--r--sax/inc/sax_processor_builder.hh82
-rw-r--r--sax/meson.build22
-rw-r--r--sax/src/decoder.cc321
-rw-r--r--sax/src/decoder.hh33
-rw-r--r--sax/src/processor.hh27
-rw-r--r--sax/src/sax_processor.cc145
-rw-r--r--sax/src/sax_processor_builder.cc62
-rw-r--r--sax/src/utils.cc70
-rw-r--r--sax/src/utils.hh22
-rw-r--r--scripts/iwyu.sh11
-rw-r--r--subprojects/.gitignore2
-rw-r--r--subprojects/gtest.wrap15
-rw-r--r--utf/inc/utf16.hh31
-rw-r--r--utf/inc/utf32.hh29
-rw-r--r--utf/inc/utf8.hh22
-rw-r--r--utf/inc/utf_error.hh13
-rw-r--r--utf/meson.build38
-rw-r--r--utf/src/utf16.cc67
-rw-r--r--utf/src/utf32.cc43
-rw-r--r--utf/src/utf8.cc68
-rw-r--r--utf/tst/test_utf16.cc157
-rw-r--r--utf/tst/test_utf32.cc145
-rw-r--r--utf/tst/test_utf8.cc188
33 files changed, 1885 insertions, 0 deletions
diff --git a/.dir-locals.el b/.dir-locals.el
new file mode 100644
index 0000000..484f453
--- /dev/null
+++ b/.dir-locals.el
@@ -0,0 +1,17 @@
+;;; Directory Local Variables
+;;; For more information see (info "(emacs) Directory Variables")
+
+((c-mode
+ .
+ ((eval .
+ (let ((project-path
+ (locate-dominating-file default-directory ".dir-locals.el")))
+ (setq-local flycheck-clangcheck-build-path
+ (concat project-path "build"))))))
+ (c++-mode
+ .
+ ((eval .
+ (let ((project-path
+ (locate-dominating-file default-directory ".dir-locals.el")))
+ (setq-local flycheck-clangcheck-build-path
+ (concat project-path "build")))))))
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fd279fc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/build/
+/build-rel/
+/build-cov/
diff --git a/base/inc/macros.hh b/base/inc/macros.hh
new file mode 100644
index 0000000..6d88669
--- /dev/null
+++ b/base/inc/macros.hh
@@ -0,0 +1,16 @@
+#ifndef BASE_MACROS_HH
+#define BASE_MACROS_HH
+
+#if defined(HAVE_ATTRIBUTE_VISIBILITY_HIDDEN)
+# define HIDDEN __attribute__((visibility ("hidden")))
+#else
+# define HIDDEN
+#endif
+
+#if defined(HAVE_ATTRIBUTE_UNLIKELY)
+# define UNLIKELY [[unlikely]]
+#else
+# define UNLIKELY
+#endif
+
+#endif // BASE_MACROS_HH
diff --git a/base/meson.build b/base/meson.build
new file mode 100644
index 0000000..71faace
--- /dev/null
+++ b/base/meson.build
@@ -0,0 +1,19 @@
+cpp = meson.get_compiler('cpp')
+cpp_flags = []
+if cpp.has_function_attribute('visibility:hidden')
+ cpp_flags += '-DHAVE_ATTRIBUTE_VISIBILITY_HIDDEN'
+endif
+if cpp.compiles('''int foo() {
+ [[unlikely]]
+ return 0;
+}''', name: 'C++20 unlikely attribute')
+ cpp_flags += '-DHAVE_ATTRIBUTE_UNLIKELY'
+ cpp_flags += '-Wno-c++20-attribute-extensions'
+endif
+
+inc = include_directories('inc')
+
+base_dep = declare_dependency(
+ compile_args: cpp_flags,
+ include_directories: inc,
+)
diff --git a/meson.build b/meson.build
new file mode 100644
index 0000000..2d571dc
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,30 @@
+project(
+ 'libmodxml', 'cpp',
+ version : '0.1',
+ meson_version: '>= 0.58',
+ default_options : [
+ 'warning_level=3',
+ 'cpp_std=c++17',
+ 'cpp_rtti=false',
+ 'cpp_eh=none',
+ 'b_ndebug=if-release',
+ ],
+)
+
+gtest_dep = dependency(
+ 'gtest',
+ version: '>= 1.10.0',
+ main: true,
+ fallback: ['gtest', 'gtest_main_dep'])
+
+gmock_dep = dependency(
+ 'gmock',
+ version: '>= 1.10.0',
+ main: false,
+ fallback: ['gtest', 'gmock_dep'])
+
+subdir('base')
+subdir('utf')
+subdir('sax')
+
+run_target('iwyu', command: 'scripts/iwyu.sh')
diff --git a/sax/inc/sax_decoder.hh b/sax/inc/sax_decoder.hh
new file mode 100644
index 0000000..40a56c9
--- /dev/null
+++ b/sax/inc/sax_decoder.hh
@@ -0,0 +1,57 @@
+#ifndef SAX_DECODER_HH
+#define SAX_DECODER_HH
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace modxml {
+namespace sax {
+
+/**
+ * Decoder returned by DecoderFactory. Used by Processor to turn bytes into
+ * unicode characters.
+ */
+class Decoder {
+ public:
+ virtual ~Decoder() = default;
+
+ enum class State {
+ GOOD = 0,
+ // too little data was given to advance
+ NEED_MORE,
+ // invalid data was given to advance
+ INVALID,
+ };
+
+ /**
+ * Decode as many code points as possible from in (start at in_offset) and
+ * write them to out (start at out_offset) as UTF-8.
+ * All written code points must be valid per Unicode, so inside the
+ * range U+0 to U+10FFFF and not a surrogate pair (U+D800-U+DFFF).
+ * No partial output, only write to out if the whole UTF-8 sequence is
+ * going to fit.
+ * There are always at least 4 bytes available (out.size() - out_offset) when
+ * called.
+ * Advance in_offset for data consumed.
+ * Advance out_offset for code points written. Do NOT write past out.size().
+ * Do NOT resize out.
+ * If at least one code point is decoded and written to out, return GOOD.
+ * If it is not possible to decode a single code point, in_offset and
+ * out_offset should not be advanced and something other than GOOD returned.
+ * Do not keep any references to any of the parameters after returning. The
+ * next decode() call will point to the following bytes, but all parameters
+ * may have changed as they are subject to the buffer implementation of the
+ * Processor.
+ */
+ virtual State decode(std::string_view in, std::size_t& in_offset,
+ std::string& out, std::size_t& out_offset) = 0;
+
+ protected:
+ Decoder() = default;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // SAX_DECODER_HH
diff --git a/sax/inc/sax_decoder_factory.hh b/sax/inc/sax_decoder_factory.hh
new file mode 100644
index 0000000..80f1af3
--- /dev/null
+++ b/sax/inc/sax_decoder_factory.hh
@@ -0,0 +1,35 @@
+#ifndef SAX_DECODER_FACTORY_HH
+#define SAX_DECODER_FACTORY_HH
+
+#include <memory>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+/**
+ * Factory for decoders. You can give one to ProcessorBuilder.
+ */
+class DecoderFactory {
+ public:
+ virtual ~DecoderFactory() = default;
+
+ /**
+ * If encoding is supported, return a decoder for that encoding.
+ * Return nullptr if not supported and Processor will return
+ * UNKNOWN_ENCODING error.
+ * Note that encoding value isn't cleaned up or validated in any way, it is
+ * reported EXACTLY as found (even if not valid per XML spec).
+ */
+ virtual std::unique_ptr<Decoder> create(std::string const& encoding) = 0;
+
+ protected:
+ DecoderFactory() = default;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // SAX_DECODER_FACTORY_HH
diff --git a/sax/inc/sax_delegate.hh b/sax/inc/sax_delegate.hh
new file mode 100644
index 0000000..ba63e72
--- /dev/null
+++ b/sax/inc/sax_delegate.hh
@@ -0,0 +1,22 @@
+#ifndef MODXML_SAX_DELEGATE_HH
+#define MODXML_SAX_DELEGATE_HH
+
+namespace modxml {
+namespace sax {
+
+/**
+ * Delegate for processor.
+ * Implement to handle events.
+ */
+class Delegate {
+ public:
+ virtual ~Delegate() = default;
+
+ protected:
+ Delegate() = default;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // MODXML_SAX_DELEGATE_HH
diff --git a/sax/inc/sax_error.hh b/sax/inc/sax_error.hh
new file mode 100644
index 0000000..748f995
--- /dev/null
+++ b/sax/inc/sax_error.hh
@@ -0,0 +1,36 @@
+#ifndef MODXML_SAX_ERROR_HH
+#define MODXML_SAX_ERROR_HH
+
+namespace modxml {
+namespace sax {
+
+enum class Error {
+ /**
+ * The XML spec has a list of characters that are never allowed in a document.
+ */
+ INVALID_CHAR,
+ /**
+ * If the document encoding is unsupported or unknown.
+ */
+ UNKNOWN_ENCODING,
+ /**
+ * If the document is incomplete. This is one of the few recoverable errors:
+ * if you call the processor with more data it will continue.
+ */
+ INCOMPLETE,
+ /**
+ * An entity in the document exceeded the max buffer size (either set by
+ * ProcessorBuilder or the default 10 MiB).
+ */
+ MAX_MEMORY_EXCEEDED,
+ /**
+ * A memory allocation failed. Note that this doesn't protect against
+ * usage of overallocated memory.
+ */
+ OUT_OF_MEMORY,
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // MODXML_SAX_ERROR_HH
diff --git a/sax/inc/sax_processor.hh b/sax/inc/sax_processor.hh
new file mode 100644
index 0000000..7ca32f7
--- /dev/null
+++ b/sax/inc/sax_processor.hh
@@ -0,0 +1,37 @@
+#ifndef MODXML_SAX_PROCESSOR_HH
+#define MODXML_SAX_PROCESSOR_HH
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Delegate;
+
+/**
+ * The XML processor, or parser if you like that term better.
+ * Feed it data and the processor will give the delegate calls with events or
+ * possibly errors.
+ */
+class Processor {
+ public:
+ virtual ~Processor() = default;
+
+ /**
+ * Construct a Processor. Same as creating a ProcessorBuilder
+ * and not changing any options and just calling build.
+ */
+ static std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate);
+
+ protected:
+ Processor() = default;
+
+ private:
+ Processor(Processor const&) = delete;
+ Processor& operator=(Processor const&) = delete;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // MODXML_SAX_PROCESSOR_HH
diff --git a/sax/inc/sax_processor_builder.hh b/sax/inc/sax_processor_builder.hh
new file mode 100644
index 0000000..070fbbf
--- /dev/null
+++ b/sax/inc/sax_processor_builder.hh
@@ -0,0 +1,82 @@
+#ifndef MODXML_SAX_PROCESSOR_BUILDER_HH
+#define MODXML_SAX_PROCESSOR_BUILDER_HH
+
+#include <memory>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class DecoderFactory;
+class Delegate;
+class Processor;
+
+/**
+ * Used to construct Processors, with options set if needed.
+ */
+class ProcessorBuilder {
+ public:
+ virtual ~ProcessorBuilder() = default;
+
+ /**
+ * Construct a ProcessorBuilder. All options are set to default.
+ */
+ static std::unique_ptr<ProcessorBuilder> create();
+
+ /**
+ * If you know the encoding of the data sent to the processor set it here,
+ * this will stop the processor from trying to autodetect and will ignore
+ * encoding in any xml declaration if found.
+ * If the encoding is unsupported/unknown the processor will fail with
+ * an error indicating this, same as if it read an XML declaration with
+ * an unsupported or unknown encoding.
+ */
+ virtual ProcessorBuilder* force_encoding(std::string const& str) = 0;
+
+ /**
+ * Set a decoder factory for encodings not supported by library.
+ * Library only calls this for encodings it doesn't support itself.
+ * Library supports UTF-8, UTF-16, UTF-32 and US-ASCII.
+ * If you want to force the decoder factory to be used, force a custom
+ * encoding with force_encoding above.
+ */
+ virtual ProcessorBuilder* custom_decoder_factory(
+ std::shared_ptr<DecoderFactory> custom_decoder_factory) = 0;
+
+ /**
+ * Set the default buffer size the processor should use.
+ * If you give a too small buffer size (such as zero) it will be ignored
+ * and an implementation-specific minimum will be used instead.
+ * This is meant as a possible optimization and can be completely ignored.
+ * Note that the processor will allocate more memory if needed.
+ */
+ virtual ProcessorBuilder* set_default_buffer_size(std::size_t size) = 0;
+
+ /**
+ * Set the max buffer size the processor should use.
+ * If you have memory constraints this will block the processing of CDATA,
+ * or other entities from allocating more than the given size.
+ * Default is 10MiB.
+ */
+ virtual ProcessorBuilder* set_max_buffer_size(std::size_t size) = 0;
+
+ /**
+ * Call to construct a Processor with the options setup in this builder,
+ * using the delegate given as parameter.
+ * May be called multiple times, will create a unique Processor each time.
+ */
+ virtual std::unique_ptr<Processor> build(
+ std::shared_ptr<Delegate> delegate) const = 0;
+
+ protected:
+ ProcessorBuilder() = default;
+
+ private:
+ ProcessorBuilder(ProcessorBuilder const&) = delete;
+ ProcessorBuilder& operator=(ProcessorBuilder const&) = delete;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // MODXML_SAX_PROCESSOR_BUILDER_HH
diff --git a/sax/meson.build b/sax/meson.build
new file mode 100644
index 0000000..ccbdef4
--- /dev/null
+++ b/sax/meson.build
@@ -0,0 +1,22 @@
+deps = [
+ base_dep,
+ utf_dep,
+]
+
+inc = include_directories('inc')
+lib = shared_library(
+ 'modxmlsax',
+ 'src/decoder.cc',
+ 'src/sax_processor.cc',
+ 'src/sax_processor_builder.cc',
+ 'src/utils.cc',
+ dependencies: deps,
+ include_directories: inc,
+ install: true,
+)
+
+sax_dep = declare_dependency(
+ dependencies: deps,
+ include_directories: inc,
+ link_with: lib,
+)
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc
new file mode 100644
index 0000000..30b1735
--- /dev/null
+++ b/sax/src/decoder.cc
@@ -0,0 +1,321 @@
+#include "decoder.hh"
+
+#include "macros.hh"
+#include "sax_decoder.hh"
+#include "utf16.hh"
+#include "utf32.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+class UtfDecoder : public Decoder {
+ public:
+ State decode(std::string_view in, std::size_t& in_offset,
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;
+ if (bom_ == -1) UNLIKELY {
+ std::size_t tmp = in_offset;
+ uint32_t ret = read(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+ if (ret == 0xfeff) {
+ // To allow offset to advance and to return, we need to
+ // read at least one more character completely.
+ ret = read(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+ bom_ = 1;
+ } else {
+ bom_ = 0;
+ }
+ in_offset = tmp;
+ out[out_offset++] = ret;
+ if (out_offset == out_size)
+ return State::GOOD;
+ }
+
+ do {
+ uint32_t ret = read(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ return State::GOOD;
+ }
+
+ protected:
+ UtfDecoder() = default;
+
+ virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;
+
+ private:
+ int8_t bom_{-1};
+};
+
+class Utf8Decoder : public UtfDecoder {
+ public:
+ Utf8Decoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read8(data, offset);
+ }
+};
+
+class Utf16BeDecoder : public UtfDecoder {
+ public:
+ Utf16BeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read16be(data, offset);
+ }
+};
+
+class Utf16LeDecoder : public UtfDecoder {
+ public:
+ Utf16LeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read16le(data, offset);
+ }
+};
+
+class Utf32BeDecoder : public UtfDecoder {
+ public:
+ Utf32BeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read32be(data, offset);
+ }
+};
+
+class Utf32LeDecoder : public UtfDecoder {
+ public:
+ Utf32LeDecoder() = default;
+
+ uint32_t read(std::string_view data, std::size_t& offset) const override {
+ return utf::read32le(data, offset);
+ }
+};
+
+class Utf16Decoder : public Decoder {
+ public:
+ Utf16Decoder() = default;
+
+ State decode(std::string_view in, std::size_t& in_offset,
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;
+ if (endian_ == -1) UNLIKELY {
+ std::size_t tmp = in_offset;
+ uint32_t ret = utf::read16be(in, tmp);
+ int8_t endian;
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+ if (ret == 0xfeff) {
+ endian = 1; // Big endian
+ } else if (ret == 0xfffe) {
+ endian = 0; // Little endian
+ } else {
+ return State::INVALID;
+ }
+
+ // To allow offset to advance and to return, we need to
+ // read at least one more character completely.
+ ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+
+ endian_ = endian;
+ in_offset = tmp;
+ out[out_offset++] = ret;
+ if (out_offset == out_size)
+ return State::GOOD;
+ }
+
+ if (endian_ == 1) {
+ do {
+ uint32_t ret = utf::read16be(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ } else {
+ do {
+ uint32_t ret = utf::read16le(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ }
+ return State::GOOD;
+ }
+
+ private:
+ int8_t endian_{-1};
+};
+
+class Utf32Decoder : public Decoder {
+ public:
+ Utf32Decoder() = default;
+
+ State decode(std::string_view in, std::size_t& in_offset,
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;
+ if (endian_ == -1) UNLIKELY {
+ std::size_t tmp = in_offset;
+ uint32_t ret = utf::read32be(in, tmp);
+ int8_t endian;
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ tmp = in_offset;
+ ret = utf::read32le(in, tmp);
+ if (ret == 0xfeff) {
+ endian = 0; // Little endian
+ } else {
+ return State::INVALID;
+ }
+ } else if (ret == 0xfeff) {
+ endian = 1; // Big endian
+ } else {
+ return State::INVALID;
+ }
+
+ // To allow offset to advance and to return, we need to
+ // read the next character completely.
+ ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
+ if (ret == utf::NEED_MORE) {
+ return State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return State::INVALID;
+ }
+
+ endian_ = endian;
+ in_offset = tmp;
+ out[out_offset++] = ret;
+ if (out_offset == out_size)
+ return State::GOOD;
+ }
+
+ if (endian_ == 1) {
+ do {
+ uint32_t ret = utf::read32be(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ } else {
+ do {
+ uint32_t ret = utf::read32le(in, in_offset);
+ if (ret == utf::NEED_MORE) {
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ }
+ if (ret == utf::INVALID) {
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ }
+ out[out_offset++] = ret;
+ } while (out_offset < out_size);
+ }
+ return State::GOOD;
+ }
+
+ private:
+ int8_t endian_{-1};
+};
+
+class AsciiDecoder : public Decoder {
+ public:
+ AsciiDecoder() = default;
+
+ State decode(std::string_view in, std::size_t& in_offset,
+ uint32_t* out, std::size_t out_size,
+ std::size_t& out_offset) override {
+ std::size_t const out_start = out_offset;
+ do {
+ if (in_offset == in.size())
+ return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+ if (in[in_offset] & 0x80)
+ return out_offset > out_start ? State::GOOD : State::INVALID;
+ out[out_offset++] = in[in_offset++];
+ } while (out_offset < out_size);
+ return State::GOOD;
+ }
+};
+
+} // namespace
+
+std::unique_ptr<Decoder> create_utf8_decoder() {
+ return std::make_unique<Utf8Decoder>();
+}
+
+std::unique_ptr<Decoder> create_utf16be_decoder() {
+ return std::make_unique<Utf16BeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf16le_decoder() {
+ return std::make_unique<Utf16LeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf32be_decoder() {
+ return std::make_unique<Utf32BeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf32le_decoder() {
+ return std::make_unique<Utf32LeDecoder>();
+}
+
+std::unique_ptr<Decoder> create_utf16_decoder() {
+ return std::make_unique<Utf16Decoder>();
+}
+
+std::unique_ptr<Decoder> create_utf32_decoder() {
+ return std::make_unique<Utf32Decoder>();
+}
+
+std::unique_ptr<Decoder> create_ascii_decoder() {
+ return std::make_unique<AsciiDecoder>();
+}
+
+} // namespace sax
+} // namespace modxml
+
diff --git a/sax/src/decoder.hh b/sax/src/decoder.hh
new file mode 100644
index 0000000..bd2a99a
--- /dev/null
+++ b/sax/src/decoder.hh
@@ -0,0 +1,33 @@
+#ifndef DECODER_HH
+#define DECODER_HH
+
+#include "macros.hh"
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+// UTF-8 with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf8_decoder();
+// UTF-16 with BOM
+std::unique_ptr<Decoder> HIDDEN create_utf16_decoder();
+// UTF-16BE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf16be_decoder();
+// UTF-16LE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf16le_decoder();
+// UTF-32 with BOM
+std::unique_ptr<Decoder> HIDDEN create_utf32_decoder();
+// UTF-32BE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf32be_decoder();
+// UTF-32LE with optional BOM
+std::unique_ptr<Decoder> HIDDEN create_utf32le_decoder();
+// US-ASCII
+std::unique_ptr<Decoder> HIDDEN create_ascii_decoder();
+
+} // namespace sax
+} // namespace modxml
+
+#endif // DECODER_HH
diff --git a/sax/src/processor.hh b/sax/src/processor.hh
new file mode 100644
index 0000000..4a2de29
--- /dev/null
+++ b/sax/src/processor.hh
@@ -0,0 +1,27 @@
+#ifndef PROCESSOR_HH
+#define PROCESSOR_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <optional>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class DecoderFactory;
+class Delegate;
+class Processor;
+
+std::unique_ptr<Processor> HIDDEN create_processor(
+ std::shared_ptr<Delegate> delegate,
+ std::shared_ptr<DecoderFactory> decoder_factory,
+ std::optional<std::string> force_encoding,
+ std::optional<std::size_t> default_buffer_size,
+ std::optional<std::size_t> max_buffer_size);
+
+} // namespace sax
+} // namespace modxml
+
+#endif // PROCESSOR_HH
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
new file mode 100644
index 0000000..ea9f753
--- /dev/null
+++ b/sax/src/sax_processor.cc
@@ -0,0 +1,145 @@
+#include "sax_processor.hh"
+
+#include "sax_decoder.hh"
+#include "processor.hh"
+#include "utils.hh"
+
+#include <algorithm>
+#include <optional>
+#include <utility>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// 2.2 Characters
+// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+
+inline bool valid_char(uint32_t c) {
+  // Input is expected to be valid Unicode already (U+0 - U+10FFFF without
+  // the surrogate blocks); only the XML-specific exclusions are applied.
+  if (c < 0x20)
+    return c == 0x9 || c == 0xa || c == 0xd;
+  // From U+0020 upwards everything passes except U+FFFE and U+FFFF.
+  return c != 0xfffe && c != 0xffff;
+}
+
+// 2.3 Common Syntactic Constructs
+// [3] S ::= (#x20 | #x9 | #xD | #xA)+
+
+inline bool is_ws(uint32_t c) {
+  // Precondition: c passed valid_char, so anything up to U+0020 is part of S.
+  return c < 0x21;
+}
+
+// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+
+inline bool is_namestartchar(uint32_t c) {
+  // Production [4]. ASCII is settled first, then the remaining ranges are
+  // checked in ascending order by rejecting the gaps between them.
+  if (c < 0x41 /* A */)
+    return c == 0x3a /* : */;
+  if (c <= 0x5a /* Z */)
+    return true;
+  if (c < 0x61 /* a */)
+    return c == 0x5f /* _ */;
+  if (c <= 0x7a /* z */)
+    return true;
+  if (c < 0xc0)
+    return false;
+  if (c < 0x300)
+    return c != 0xd7 && c != 0xf7;
+  // [#x300-#x36F] is NameChar only, and #x37E sits between two ranges.
+  if (c < 0x370 || c == 0x37e)
+    return false;
+  if ((c > 0x1fff && c < 0x200c) || (c > 0x200d && c < 0x2070))
+    return false;
+  if ((c > 0x218f && c < 0x2c00) || (c > 0x2fef && c < 0x3001))
+    return false;
+  // Gap between #xD7FF and #xF900; this also rejects surrogates should one
+  // ever get this far.
+  if (c > 0xd7ff && c < 0xf900)
+    return false;
+  if (c > 0xfdcf && c < 0xfdf0)
+    return false;
+  // The final range is [#x10000-#xEFFFF], so #xFFFE, #xFFFF and everything
+  // from #xF0000 upwards are not name start characters.
+  return c <= 0xfffd || (c >= 0x10000 && c <= 0xeffff);
+}
+
+inline bool is_namechar(uint32_t c) {
+  // "-", ".", [0-9], #xB7, the two extra ranges, or any NameStartChar.
+  return (c >= 0x2d && c <= 0x39 && c != 0x2f) || c == 0xb7 || c == 0x203f ||
+         c == 0x2040 || (c >= 0x300 && c <= 0x36f) || is_namestartchar(c);
+}
+
+/* [5] Name ::= NameStartChar (NameChar)*
+[6] Names ::= Name (#x20 Name)*
+[7] Nmtoken ::= (NameChar)+
+[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+*/
+
+class ProcessorImpl : public Processor {
+ public:
+ ProcessorImpl(std::shared_ptr<Delegate> delegate,
+ std::shared_ptr<DecoderFactory> decoder_factory,
+ std::unique_ptr<Decoder> decoder,
+ std::size_t default_buffer_size,
+ std::size_t max_buffer_size)
+ : delegate_(std::move(delegate)),
+ decoder_factory_(std::move(decoder_factory)),
+ decoder_(std::move(decoder)),
+ default_buffer_size_(default_buffer_size),
+ max_buffer_size_(max_buffer_size) {}
+
+ private:
+ std::shared_ptr<Delegate> delegate_;
+ std::shared_ptr<DecoderFactory> decoder_factory_;
+ std::unique_ptr<Decoder> decoder_;
+ std::size_t default_buffer_size_;
+ std::size_t max_buffer_size_;
+};
+
+} // namespace
+
+std::unique_ptr<Processor> create_processor(
+    std::shared_ptr<Delegate> delegate,
+    std::shared_ptr<DecoderFactory> decoder_factory,
+    std::optional<std::string> force_encoding,
+    std::optional<std::size_t> opt_default_buffer_size,
+    std::optional<std::size_t> opt_max_buffer_size) {
+  // Buffers start at 8 KiB unless the caller asked for something else, and
+  // a request below 128 bytes is raised to 128.
+  constexpr std::size_t kMinBufferSize = 128;
+  std::size_t const default_buffer_size =
+      opt_default_buffer_size
+          ? std::max(kMinBufferSize, *opt_default_buffer_size)
+          : 8192;
+
+  // The 10 MiB default is documented in public headers. Do NOT change.
+  // A caller-supplied limit is used as given: too small a value leads to
+  // MAX_MEMORY_EXCEEDED, too large a value to OUT_OF_MEMORY or a crash
+  // depending on platform.
+  std::size_t const max_buffer_size =
+      opt_max_buffer_size.value_or(10 * 1024 * 1024);
+
+  // A forced encoding gets its decoder now; otherwise it is left empty.
+  std::unique_ptr<Decoder> decoder;
+  if (force_encoding)
+    decoder = pick_decoder_for_encoding(*force_encoding,
+                                        decoder_factory.get());
+
+  return std::make_unique<ProcessorImpl>(
+      std::move(delegate), std::move(decoder_factory), std::move(decoder),
+      default_buffer_size, max_buffer_size);
+}
+
+std::unique_ptr<Processor> Processor::create(std::shared_ptr<Delegate> delegate) {
+  return create_processor(std::move(delegate), nullptr,  // no custom factory
+                          std::nullopt, std::nullopt, std::nullopt);  // defaults
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/sax_processor_builder.cc b/sax/src/sax_processor_builder.cc
new file mode 100644
index 0000000..8817099
--- /dev/null
+++ b/sax/src/sax_processor_builder.cc
@@ -0,0 +1,62 @@
+#include "sax_processor_builder.hh"
+
+#include "processor.hh"
+#include "sax_processor.hh"
+
+#include <optional>
+#include <utility>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+class ProcessorBuilderImpl : public ProcessorBuilder {
+ public:
+ ProcessorBuilder* force_encoding(std::string const& str) override {
+ force_encoding_ = str;
+ return this;
+ }
+
+ ProcessorBuilder* custom_decoder_factory(
+ std::shared_ptr<DecoderFactory> custom_decoder_factory) override {
+ decoder_factory_ = std::move(custom_decoder_factory);
+ return this;
+ }
+
+ ProcessorBuilder* set_default_buffer_size(std::size_t size) override {
+ default_buffer_size_ = size;
+ return this;
+ }
+
+ ProcessorBuilder* set_max_buffer_size(std::size_t size) override {
+ max_buffer_size_ = size;
+ return this;
+ }
+
+ std::unique_ptr<Processor> build(
+ std::shared_ptr<Delegate> delegate) const override {
+ return create_processor(std::move(delegate),
+ decoder_factory_,
+ force_encoding_,
+ default_buffer_size_,
+ max_buffer_size_);
+ }
+
+ ProcessorBuilderImpl() = default;
+
+ private:
+ std::shared_ptr<DecoderFactory> decoder_factory_;
+ std::optional<std::string> force_encoding_;
+ std::optional<std::size_t> default_buffer_size_;
+ std::optional<std::size_t> max_buffer_size_;
+};
+
+} // namespace
+
+std::unique_ptr<ProcessorBuilder> ProcessorBuilder::create() {
+ return std::make_unique<ProcessorBuilderImpl>();
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/utils.cc b/sax/src/utils.cc
new file mode 100644
index 0000000..f0366d5
--- /dev/null
+++ b/sax/src/utils.cc
@@ -0,0 +1,70 @@
+#include "utils.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+std::string cleanup_encoding(std::string const& str) {
+  // Keep [a-z0-9], lower-case [A-Z], map '.', '_', '-' to '-', drop the rest.
+  std::string cleaned;
+  cleaned.reserve(str.size());
+  for (char const ch : str) {
+    bool const upper = ch >= 'A' && ch <= 'Z';
+    bool const keep = (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9');
+    if (upper || keep)
+      cleaned.push_back(upper ? static_cast<char>(ch | 0x20) : ch);
+    else if (ch == '.' || ch == '_' || ch == '-')
+      cleaned.push_back('-');
+  }
+  return cleaned;
+}
+
+} // namespace
+
+// Names inspired by:
+// https://www.iana.org/assignments/character-sets/character-sets.xhtml
+std::unique_ptr<Decoder> pick_decoder_for_encoding(
+    std::string const& encoding, DecoderFactory* factory) {
+  // Built-in decoders are looked up by the normalised name, so case and the
+  // choice of '.', '_' or '-' as separator do not matter.
+  std::string const name = cleanup_encoding(encoding);
+  auto const is_any = [&name](std::initializer_list<char const*> aliases) {
+    for (char const* alias : aliases) {
+      if (name == alias)
+        return true;
+    }
+    return false;
+  };
+
+  if (is_any({"utf-8", "utf8"}))
+    return create_utf8_decoder();
+  if (is_any({"utf-16", "utf16"}))
+    return create_utf16_decoder();
+  if (is_any({"utf-16be", "utf16be"}))
+    return create_utf16be_decoder();
+  if (is_any({"utf-16le", "utf16le"}))
+    return create_utf16le_decoder();
+  if (is_any({"utf-32", "utf32"}))
+    return create_utf32_decoder();
+  if (is_any({"utf-32be", "utf32be"}))
+    return create_utf32be_decoder();
+  if (is_any({"utf-32le", "utf32le"}))
+    return create_utf32le_decoder();
+  if (is_any({"ascii", "us-ascii", "usascii", "iso-ir-6", "ansi-x3-4-1968",
+              "ansi-x3-4-1986", "iso-646-irv1991", "iso646-us", "us",
+              "ibm367", "cp367"}))
+    return create_ascii_decoder();
+
+  // Anything else is offered to the custom factory, which receives the
+  // encoding exactly as given rather than the normalised name.
+  return factory ? factory->create(encoding) : nullptr;
+}
+
+} // namespace sax
+
+} // namespace modxml
diff --git a/sax/src/utils.hh b/sax/src/utils.hh
new file mode 100644
index 0000000..206d003
--- /dev/null
+++ b/sax/src/utils.hh
@@ -0,0 +1,22 @@
+#ifndef UTILS_HH
+#define UTILS_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <string>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+class DecoderFactory;
+
+std::unique_ptr<Decoder> HIDDEN pick_decoder_for_encoding(
+ std::string const& encoding,
+ DecoderFactory* factory);
+
+} // namespace sax
+} // namespace modxml
+
+#endif // UTILS_HH
diff --git a/scripts/iwyu.sh b/scripts/iwyu.sh
new file mode 100644
index 0000000..dfdd651
--- /dev/null
+++ b/scripts/iwyu.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+iwyu_tool=iwyu-tool
+
+if ! command -v "$iwyu_tool" &> /dev/null; then
+ iwyu_tool=iwyu_tool.py
+fi
+
+cpus=`grep processor /proc/cpuinfo | wc -l`
+
+exec $iwyu_tool -o clang -j $cpus -p "${MESON_BUILD_ROOT}" -- -Xiwyu --mapping_file="${MESON_SOURCE_ROOT}"/iwyu-mappings.imp
diff --git a/subprojects/.gitignore b/subprojects/.gitignore
new file mode 100644
index 0000000..847cb91
--- /dev/null
+++ b/subprojects/.gitignore
@@ -0,0 +1,2 @@
+googletest-1.13.0/
+packagecache/
diff --git a/subprojects/gtest.wrap b/subprojects/gtest.wrap
new file mode 100644
index 0000000..195aaca
--- /dev/null
+++ b/subprojects/gtest.wrap
@@ -0,0 +1,15 @@
+[wrap-file]
+directory = googletest-1.13.0
+source_url = https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz
+source_filename = gtest-1.13.0.tar.gz
+source_hash = ad7fdba11ea011c1d925b3289cf4af2c66a352e18d4c7264392fead75e919363
+patch_filename = gtest_1.13.0-1_patch.zip
+patch_url = https://wrapdb.mesonbuild.com/v2/gtest_1.13.0-1/get_patch
+patch_hash = 6d82a02c3a45071cea989983bf6becde801cbbfd29196ba30dada0215393b082
+wrapdb_version = 1.13.0-1
+
+[provide]
+gtest = gtest_dep
+gtest_main = gtest_main_dep
+gmock = gmock_dep
+gmock_main = gmock_main_dep
diff --git a/utf/inc/utf16.hh b/utf/inc/utf16.hh
new file mode 100644
index 0000000..344b1a2
--- /dev/null
+++ b/utf/inc/utf16.hh
@@ -0,0 +1,31 @@
+#ifndef UTF_UTF16_HH
+#define UTF_UTF16_HH
+
+#include "macros.hh"
+
+#include <cstdint>
+#include <string_view>
+
+namespace utf {
+
+/* Read one Unicode code point from UTF-16 big-endian encoded data if possible.
+ * If successful, offset is incremented to point to the next code point.
+ * Will fail if:
+ * - not enough data is left in data at the given offset; returns NEED_MORE.
+ * - data is not valid UTF-16, i.e. invalid or incomplete surrogate pairs;
+ * returns INVALID.
+ */
+uint32_t HIDDEN read16be(std::string_view data, std::size_t& offset);
+
+/* Read one Unicode code point from UTF-16 little-endian encoded data if possible.
+ * If successful, offset is incremented to point to the next code point.
+ * Will fail if:
+ * - not enough data is left in data at the given offset; returns NEED_MORE.
+ * - data is not valid UTF-16, i.e. invalid or incomplete surrogate pairs;
+ * returns INVALID.
+ */
+uint32_t HIDDEN read16le(std::string_view data, std::size_t& offset);
+
+} // namespace utf
+
+#endif // UTF_UTF16_HH
diff --git a/utf/inc/utf32.hh b/utf/inc/utf32.hh
new file mode 100644
index 0000000..2d3088e
--- /dev/null
+++ b/utf/inc/utf32.hh
@@ -0,0 +1,29 @@
+#ifndef UTF_UTF32_HH
+#define UTF_UTF32_HH
+
+#include "macros.hh"
+
+#include <cstdint>
+#include <string_view>
+
+namespace utf {
+
+/* Read one Unicode code point from UTF-32 big-endian encoded data if possible.
+ * If successful, offset is incremented to point to the next code point.
+ * Will fail if:
+ * - not enough data is left in data at the given offset; returns NEED_MORE.
+ * - data is not valid UTF-32, i.e. outside the valid ranges; returns INVALID.
+ */
+uint32_t HIDDEN read32be(std::string_view data, std::size_t& offset);
+
+/* Read one Unicode code point from UTF-32 little-endian encoded data if possible.
+ * If successful, offset is incremented to point to the next code point.
+ * Will fail if:
+ * - not enough data is left in data at the given offset; returns NEED_MORE.
+ * - data is not valid UTF-32, i.e. outside the valid ranges; returns INVALID.
+ */
+uint32_t HIDDEN read32le(std::string_view data, std::size_t& offset);
+
+} // namespace utf
+
+#endif // UTF_UTF32_HH
diff --git a/utf/inc/utf8.hh b/utf/inc/utf8.hh
new file mode 100644
index 0000000..a3ea84a
--- /dev/null
+++ b/utf/inc/utf8.hh
@@ -0,0 +1,22 @@
+#ifndef UTF_UTF8_HH
+#define UTF_UTF8_HH
+
+#include "macros.hh"
+
+#include <cstdint>
+#include <string_view>
+
+namespace utf {
+
+/* Read one Unicode code point from UTF-8 encoded data if possible.
+ * If successful, offset is incremented to point to the next code point.
+ * Will fail if:
+ * - not enough data is left in data at the given offset; returns NEED_MORE.
+ * - data is not valid UTF-8 (this includes overlong encodings and
+ * invalid Unicode code points); returns INVALID.
+ */
+uint32_t HIDDEN read8(std::string_view data, std::size_t& offset);
+
+} // namespace utf
+
+#endif // UTF_UTF8_HH
diff --git a/utf/inc/utf_error.hh b/utf/inc/utf_error.hh
new file mode 100644
index 0000000..079fa43
--- /dev/null
+++ b/utf/inc/utf_error.hh
@@ -0,0 +1,13 @@
+#ifndef UTF_ERROR_HH
+#define UTF_ERROR_HH
+
+#include <cstdint>
+
+namespace utf {
+
+constexpr uint32_t NEED_MORE = 0xfffffffe; // returned by the read* functions when data ends before a whole code point; above 0x10ffff, so never a decoded value
+constexpr uint32_t INVALID = 0xffffffff; // returned by the read* functions for malformed input; above 0x10ffff, so never a decoded value
+
+} // namespace utf
+
+#endif // UTF_ERROR_HH
diff --git a/utf/meson.build b/utf/meson.build
new file mode 100644
index 0000000..64db6ff
--- /dev/null
+++ b/utf/meson.build
@@ -0,0 +1,38 @@
+deps = [ # dependencies used to build the library and re-exported through utf_dep
+ base_dep,
+]
+
+inc = include_directories('inc')
+lib = static_library( # the three decoders are built into one static library that is not installed
+ 'utf',
+ 'src/utf8.cc',
+ 'src/utf16.cc',
+ 'src/utf32.cc',
+ dependencies: deps,
+ include_directories: inc,
+ install: false,
+)
+
+utf_dep = declare_dependency( # bundles the include path, the static library and deps into one dependency object
+ dependencies: deps,
+ include_directories: inc,
+ link_with: lib,
+)
+
+test('utf8', # one test executable per encoding, each linking utf_dep and gtest_dep
+ executable(
+ 'test_utf8',
+ sources: ['tst/test_utf8.cc'],
+ dependencies: [utf_dep, gtest_dep]))
+
+test('utf16',
+ executable(
+ 'test_utf16',
+ sources: ['tst/test_utf16.cc'],
+ dependencies: [utf_dep, gtest_dep]))
+
+test('utf32',
+ executable(
+ 'test_utf32',
+ sources: ['tst/test_utf32.cc'],
+ dependencies: [utf_dep, gtest_dep]))
diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc
new file mode 100644
index 0000000..43595bf
--- /dev/null
+++ b/utf/src/utf16.cc
@@ -0,0 +1,67 @@
+#include "utf16.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+inline bool is_high_surrogate(uint16_t c) { // true for 0xD800..0xDBFF, the unit that must come first in a surrogate pair
+ return c >= 0xd800 && c <= 0xdbff;
+}
+
+inline bool is_low_surrogate(uint16_t c) { // true for 0xDC00..0xDFFF, the unit that must come second in a surrogate pair
+ return c >= 0xdc00 && c <= 0xdfff;
+}
+
+} // namespace
+
+uint32_t read16be(std::string_view data, std::size_t& offset) { // decodes one code point from big-endian UTF-16 at data[offset]; offset is only advanced on success
+ if (offset > data.size() || data.size() - offset < 2) // the first test keeps the unsigned subtraction from wrapping
+ return NEED_MORE;
+ uint16_t c = static_cast<uint16_t>(data[offset]) << 8 // high byte first; the result is narrowed back to 16 bits when stored in c
+ | static_cast<uint16_t>(data[offset + 1] & 0xff);
+ if (is_high_surrogate(c)) {
+ if (data.size() - offset < 4) // a surrogate pair needs two 16-bit units
+ return NEED_MORE;
+ uint16_t d = static_cast<uint16_t>(data[offset + 2]) << 8
+ | static_cast<uint16_t>(data[offset + 3] & 0xff);
+ if (is_low_surrogate(d)) {
+ offset += 4;
+ return 0x10000 // the pair stores (code point - 0x10000): 10 bits from c above 10 bits from d
+ + (static_cast<uint32_t>(c & 0x3ff) << 10
+ | (d & 0x3ff));
+ }
+ return INVALID; // high surrogate not followed by a low surrogate
+ } else if (is_low_surrogate(c)) {
+ return INVALID; // low surrogate without a preceding high surrogate
+ }
+ offset += 2;
+ return c;
+}
+
+uint32_t read16le(std::string_view data, std::size_t& offset) { // decodes one code point from little-endian UTF-16 at data[offset]; offset is only advanced on success
+ if (offset > data.size() || data.size() - offset < 2) // the first test keeps the unsigned subtraction from wrapping
+ return NEED_MORE;
+ uint16_t c = static_cast<uint16_t>(data[offset + 1]) << 8 // second byte is the high byte; the result is narrowed back to 16 bits when stored in c
+ | static_cast<uint16_t>(data[offset] & 0xff);
+ if (is_high_surrogate(c)) {
+ if (data.size() - offset < 4) // a surrogate pair needs two 16-bit units
+ return NEED_MORE;
+ uint16_t d = static_cast<uint16_t>(data[offset + 3]) << 8
+ | static_cast<uint16_t>(data[offset + 2] & 0xff);
+ if (is_low_surrogate(d)) {
+ offset += 4;
+ return 0x10000 // the pair stores (code point - 0x10000): 10 bits from c above 10 bits from d
+ + (static_cast<uint32_t>(c & 0x3ff) << 10
+ | (d & 0x3ff));
+ }
+ return INVALID; // high surrogate not followed by a low surrogate
+ } else if (is_low_surrogate(c)) {
+ return INVALID; // low surrogate without a preceding high surrogate
+ }
+ offset += 2;
+ return c;
+}
+
+} // namespace utf
diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc
new file mode 100644
index 0000000..cfa29b6
--- /dev/null
+++ b/utf/src/utf32.cc
@@ -0,0 +1,43 @@
+#include "utf32.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+inline bool valid_codepoint(uint32_t c) { // accepts 0..0x10FFFF except the surrogate range 0xD800..0xDFFF
+ return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff);
+}
+
+} // namespace
+
+uint32_t read32be(std::string_view data, std::size_t& offset) { // decodes one code point from big-endian UTF-32 at data[offset]; offset is only advanced on success
+ if (offset > data.size() || data.size() - offset < 4) // the first test keeps the unsigned subtraction from wrapping
+ return NEED_MORE;
+ uint32_t c = static_cast<uint32_t>(data[offset]) << 24 // no mask needed here: the 24-bit shift pushes out everything above the low byte
+ | static_cast<uint32_t>(data[offset + 1] & 0xff) << 16
+ | static_cast<uint32_t>(data[offset + 2] & 0xff) << 8
+ | static_cast<uint32_t>(data[offset + 3] & 0xff);
+ if (valid_codepoint(c)) {
+ offset += 4;
+ return c;
+ }
+ return INVALID; // surrogate value or above 0x10FFFF
+}
+
+uint32_t read32le(std::string_view data, std::size_t& offset) { // decodes one code point from little-endian UTF-32 at data[offset]; offset is only advanced on success
+ if (offset > data.size() || data.size() - offset < 4) // the first test keeps the unsigned subtraction from wrapping
+ return NEED_MORE;
+ uint32_t c = static_cast<uint32_t>(data[offset + 3]) << 24 // last byte is the most significant; the 24-bit shift pushes out everything above the low byte
+ | static_cast<uint32_t>(data[offset + 2] & 0xff) << 16
+ | static_cast<uint32_t>(data[offset + 1] & 0xff) << 8
+ | static_cast<uint32_t>(data[offset] & 0xff);
+ if (valid_codepoint(c)) {
+ offset += 4;
+ return c;
+ }
+ return INVALID; // surrogate value or above 0x10FFFF
+}
+
+} // namespace utf
diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc
new file mode 100644
index 0000000..54b0296
--- /dev/null
+++ b/utf/src/utf8.cc
@@ -0,0 +1,68 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+inline bool valid_codepoint(uint32_t c) { // accepts 0..0x10FFFF except the surrogate range 0xD800..0xDFFF
+ return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff);
+}
+
+} // namespace
+
+uint32_t read8(std::string_view data, std::size_t& offset) { // decodes one code point from UTF-8 at data[offset]; offset is only advanced on success
+ if (offset >= data.size())
+ return NEED_MORE;
+ uint32_t ret;
+ uint8_t size;
+ switch (static_cast<uint8_t>(data[offset]) >> 4) { // dispatch on the top four bits of the lead byte
+ case 15: // 1111xxxx: four-byte sequence
+ if (data[offset] & 0x08) // 11111xxx is not a valid lead byte
+ return INVALID;
+ ret = static_cast<uint32_t>(data[offset] & 0x07) << 18;
+ size = 4;
+ break;
+ case 14: // 1110xxxx: three-byte sequence
+ ret = static_cast<uint32_t>(data[offset] & 0x0f) << 12;
+ size = 3;
+ break;
+ case 13:
+ case 12: // 110xxxxx: two-byte sequence
+ ret = static_cast<uint32_t>(data[offset] & 0x1f) << 6;
+ size = 2;
+ break;
+ default: // lead bytes 0x00..0xBF
+ if (data[offset] & 0x80) // 10xxxxxx: continuation byte where a lead byte is required
+ return INVALID;
+ return data[offset++]; // single-byte (ASCII) value
+ }
+ if (data.size() - offset < size) // offset < data.size() was checked above, so this cannot wrap
+ return NEED_MORE;
+ for (uint8_t i = 1; i < size; ++i) {
+ if ((data[offset + i] & 0xc0) != 0x80) // every following byte must be 10xxxxxx
+ return INVALID;
+ ret |= static_cast<uint32_t>(data[offset + i] & 0x3f) << (size - i - 1) * 6; // six payload bits per continuation byte
+ }
+ if (!valid_codepoint(ret))
+ return INVALID;
+ switch (size) { // reject overlong forms: the value must be too large for a shorter sequence
+ case 4:
+ if (ret < 0x10000)
+ return INVALID;
+ break;
+ case 3:
+ if (ret < 0x800)
+ return INVALID;
+ break;
+ case 2:
+ if (ret < 0x80)
+ return INVALID;
+ break;
+ }
+ offset += size;
+ return ret;
+}
+
+} // namespace utf
diff --git a/utf/tst/test_utf16.cc b/utf/tst/test_utf16.cc
new file mode 100644
index 0000000..c17982e
--- /dev/null
+++ b/utf/tst/test_utf16.cc
@@ -0,0 +1,157 @@
+#include "utf16.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+TEST(utf16be, sanity) { // well-formed big-endian input: single units and surrogate pairs
+ std::string_view str("\x00\x24", 2); // explicit length because the data starts with a NUL byte
+ size_t offset = 0;
+ auto ret = utf::read16be(str, offset);
+ EXPECT_EQ('$', ret);
+ EXPECT_EQ(2, offset);
+
+ str = "\x20\xAC";
+ offset = 0;
+ ret = utf::read16be(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ EXPECT_EQ(2, offset);
+
+ str = "\xD8\x01\xDC\x37"; // surrogate pair
+ offset = 0;
+ ret = utf::read16be(str, offset);
+ EXPECT_EQ(0x10437, ret);
+ EXPECT_EQ(4, offset);
+
+ str = "\xD8\x52\xDF\x62"; // surrogate pair
+ offset = 0;
+ ret = utf::read16be(str, offset);
+ EXPECT_EQ(0x24B62, ret);
+ EXPECT_EQ(4, offset);
+}
+
+TEST(utf16le, sanity) { // well-formed little-endian input: single units and surrogate pairs
+ std::string_view str("\x24\x00", 2); // explicit length because the data contains a NUL byte
+ size_t offset = 0;
+ auto ret = utf::read16le(str, offset);
+ EXPECT_EQ('$', ret);
+ EXPECT_EQ(2, offset);
+
+ str = "\xAC\x20";
+ offset = 0;
+ ret = utf::read16le(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ EXPECT_EQ(2, offset);
+
+ str = "\x01\xD8\x37\xDC"; // surrogate pair
+ offset = 0;
+ ret = utf::read16le(str, offset);
+ EXPECT_EQ(0x10437, ret);
+ EXPECT_EQ(4, offset);
+
+ str = "\x52\xD8\x62\xDF"; // surrogate pair
+ offset = 0;
+ ret = utf::read16le(str, offset);
+ EXPECT_EQ(0x24B62, ret);
+ EXPECT_EQ(4, offset);
+}
+
+TEST(utf16be, bom) { // a leading U+FEFF is returned like any other code point, then reading continues
+ std::string_view str("\xFE\xFF\x20\xAC");
+ size_t offset = 0;
+ auto ret = utf::read16be(str, offset);
+ EXPECT_EQ(0xFEFF, ret);
+ ret = utf::read16be(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ ret = utf::read16be(str, offset); // nothing left to read
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(str.size(), offset);
+}
+
+TEST(utf16le, bom) { // a leading U+FEFF is returned like any other code point, then reading continues
+ std::string_view str("\xFF\xFE\xAC\x20");
+ size_t offset = 0;
+ auto ret = utf::read16le(str, offset);
+ EXPECT_EQ(0xFEFF, ret);
+ ret = utf::read16le(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ ret = utf::read16le(str, offset); // nothing left to read
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(str.size(), offset);
+}
+
+TEST(utf16be, invalid) { // truncated input gives NEED_MORE, bad surrogates give INVALID; offset never moves
+ std::string_view str("\xD8"); // half of a 16-bit unit
+ size_t offset = 0;
+ auto ret = utf::read16be(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "";
+ offset = 0;
+ ret = utf::read16be(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xD8\x01"; // high surrogate with nothing after it
+ offset = 0;
+ ret = utf::read16be(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xD8\x01\xDC"; // high surrogate plus half of the next unit
+ offset = 0;
+ ret = utf::read16be(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xDC\x37\xD8\x01"; // low surrogate first
+ offset = 0;
+ ret = utf::read16be(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xD8\x01\xD8\x01"; // two high surrogates in a row
+ offset = 0;
+ ret = utf::read16be(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+}
+
+TEST(utf16le, invalid) { // truncated input gives NEED_MORE, bad surrogates give INVALID; offset never moves
+ std::string_view str("\x01"); // half of a 16-bit unit
+ size_t offset = 0;
+ auto ret = utf::read16le(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "";
+ offset = 0;
+ ret = utf::read16le(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\x01\xD8"; // high surrogate with nothing after it
+ offset = 0;
+ ret = utf::read16le(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\x01\xD8\x37"; // high surrogate plus half of the next unit
+ offset = 0;
+ ret = utf::read16le(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\x37\xDC\x01\xD8"; // low surrogate first
+ offset = 0;
+ ret = utf::read16le(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\x01\xD8\x01\xD8"; // two high surrogates in a row
+ offset = 0;
+ ret = utf::read16le(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+}
diff --git a/utf/tst/test_utf32.cc b/utf/tst/test_utf32.cc
new file mode 100644
index 0000000..796b4cd
--- /dev/null
+++ b/utf/tst/test_utf32.cc
@@ -0,0 +1,145 @@
+#include "utf32.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+TEST(utf32be, sanity) { // well-formed big-endian input; lengths are explicit because the data contains NUL bytes
+ std::string_view str("\x00\x00\x00\x24", 4);
+ size_t offset = 0;
+ auto ret = utf::read32be(str, offset);
+ EXPECT_EQ('$', ret);
+ EXPECT_EQ(4, offset);
+
+ str = std::string_view("\x00\x00\x20\xAC", 4);
+ offset = 0;
+ ret = utf::read32be(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ EXPECT_EQ(4, offset);
+
+ str = std::string_view("\x00\x01\x04\x37", 4);
+ offset = 0;
+ ret = utf::read32be(str, offset);
+ EXPECT_EQ(0x10437, ret);
+ EXPECT_EQ(4, offset);
+}
+
+TEST(utf32le, sanity) { // well-formed little-endian input; lengths are explicit because the data contains NUL bytes
+ std::string_view str("\x24\x00\x00\x00", 4);
+ size_t offset = 0;
+ auto ret = utf::read32le(str, offset);
+ EXPECT_EQ('$', ret);
+ EXPECT_EQ(4, offset);
+
+ str = std::string_view("\xAC\x20\x00\x00", 4);
+ offset = 0;
+ ret = utf::read32le(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ EXPECT_EQ(4, offset);
+
+ str = std::string_view("\x37\x04\x01\x00", 4);
+ offset = 0;
+ ret = utf::read32le(str, offset);
+ EXPECT_EQ(0x10437, ret);
+ EXPECT_EQ(4, offset);
+}
+
+TEST(utf32be, invalid) { // out-of-range values give INVALID, short input gives NEED_MORE; offset never moves
+ std::string_view str("\xFF\xFF\xFF\xFF"); // 0xFFFFFFFF is above 0x10FFFF
+ size_t offset = 0;
+ auto ret = utf::read32be(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = std::string_view("\x00\x00\xD8\x00", 4); // 0xD800 is in the surrogate range
+ offset = 0;
+ ret = utf::read32be(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "";
+ offset = 0;
+ ret = utf::read32be(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = std::string_view("\x00", 1); // one to three bytes are not enough for a 32-bit unit
+ offset = 0;
+ ret = utf::read32be(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = std::string_view("\x00\x00", 2);
+ offset = 0;
+ ret = utf::read32be(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = std::string_view("\x00\x00\x00", 3);
+ offset = 0;
+ ret = utf::read32be(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+}
+
+TEST(utf32le, invalid) { // out-of-range values give INVALID, short input gives NEED_MORE; offset never moves
+ std::string_view str("\xFF\xFF\xFF\xFF"); // 0xFFFFFFFF is above 0x10FFFF
+ size_t offset = 0;
+ auto ret = utf::read32le(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = std::string_view("\x00\xD8\x00\x00", 4); // 0xD800 is in the surrogate range
+ offset = 0;
+ ret = utf::read32le(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "";
+ offset = 0;
+ ret = utf::read32le(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = std::string_view("\x00", 1); // one to three bytes are not enough for a 32-bit unit
+ offset = 0;
+ ret = utf::read32le(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = std::string_view("\x00\x00", 2);
+ offset = 0;
+ ret = utf::read32le(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = std::string_view("\x00\x00\x00", 3);
+ offset = 0;
+ ret = utf::read32le(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+}
+
+TEST(utf32be, bom) { // a leading byte order mark (U+FEFF) is returned like any other code point, then reading continues
+ std::string_view str("\x00\x00\xFE\xFF\x00\x00\x20\xAC", 8); // UTF-32BE BOM is 00 00 FE FF; explicit length because of the NUL bytes
+ size_t offset = 0;
+ auto ret = utf::read32be(str, offset);
+ EXPECT_EQ(0xFEFF, ret);
+ ret = utf::read32be(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ ret = utf::read32be(str, offset); // nothing left to read
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(str.size(), offset);
+}
+
+TEST(utf32le, bom) { // a leading byte order mark (U+FEFF) is returned like any other code point, then reading continues
+ std::string_view str("\xFF\xFE\x00\x00\xAC\x20\x00\x00", 8); // UTF-32LE BOM is FF FE 00 00; explicit length because of the NUL bytes
+ size_t offset = 0;
+ auto ret = utf::read32le(str, offset);
+ EXPECT_EQ(0xFEFF, ret);
+ ret = utf::read32le(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ ret = utf::read32le(str, offset); // nothing left to read
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(str.size(), offset);
+}
diff --git a/utf/tst/test_utf8.cc b/utf/tst/test_utf8.cc
new file mode 100644
index 0000000..10df969
--- /dev/null
+++ b/utf/tst/test_utf8.cc
@@ -0,0 +1,188 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+TEST(utf8, sanity) { // well-formed sequences of one to four bytes
+ std::string_view str("$");
+ size_t offset = 0;
+ auto ret = utf::read8(str, offset);
+ EXPECT_EQ('$', ret);
+ EXPECT_EQ(1, offset);
+
+ str = "\xC2\xA3"; // two bytes
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0xa3, ret);
+ EXPECT_EQ(2, offset);
+
+ str = "\xD0\x98";
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x418, ret);
+ EXPECT_EQ(2, offset);
+
+ str = "\xE0\xA4\xB9"; // three bytes
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x939, ret);
+ EXPECT_EQ(3, offset);
+
+ str = "\xE2\x82\xAC";
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ EXPECT_EQ(3, offset);
+
+ str = "\xED\x95\x9C"; // 0xD55C, just below the surrogate range
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0xD55C, ret);
+ EXPECT_EQ(3, offset);
+
+ str = "\xF0\x90\x8D\x88"; // four bytes
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x10348, ret);
+ EXPECT_EQ(4, offset);
+}
+
+TEST(utf8, overlong) { // values encoded with more bytes than necessary are rejected; offset never moves
+ std::string_view str("\xF0\x82\x82\xAC"); // 0x20AC in four bytes
+ size_t offset = 0;
+ auto ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xE0\x81\x81"; // 0x41 in three bytes
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xC0\x80"; // 0x00 in two bytes
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+}
+
+TEST(utf8, invalid) { // malformed input gives INVALID, truncated input gives NEED_MORE; offset never moves
+ std::string_view str("\xED\xB0\x80"); // decodes to 0xDC00, a surrogate
+ size_t offset = 0;
+ auto ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xFB\xFF\xFF"; // 11111xxx is not a valid lead byte
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xFF\xFF\xFF\xFF\xFF";
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "";
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\x80"; // continuation byte where a lead byte is required
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xC2"; // two-byte lead with nothing after it
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xC2\x03"; // second byte is not a continuation byte
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xE0\xA4"; // three-byte sequence cut short
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+
+ str = "\xF0\x90\x8D"; // four-byte sequence cut short
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0, offset);
+}
+
+TEST(utf8, multiple1) { // reads a whole string with one shared offset; mixes one-, two- and three-byte sequences
+ std::string_view str("\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69"
+ "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74");
+ size_t offset = 0;
+ auto ret = utf::read8(str, offset);
+ EXPECT_EQ('M', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0xEC, ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('n', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('h', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(' ', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('n', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0xF3, ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('i', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(' ', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('t', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('i', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x1EBF, ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('n', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('g', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(' ', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('V', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('i', ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x1EC7, ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ('t', ret);
+ ret = utf::read8(str, offset); // nothing left to read
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(str.size(), offset);
+}
+
+TEST(utf8, multiple2) { // reads a whole string with one shared offset: a four-byte sequence followed by three three-byte sequences
+ std::string_view str("\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A");
+ size_t offset = 0;
+ auto ret = utf::read8(str, offset);
+ EXPECT_EQ(0x2825F, ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x5450, ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x35C2, ret);
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x8D8A, ret);
+ ret = utf::read8(str, offset); // nothing left to read
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(str.size(), offset);
+}