diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2023-06-13 10:07:16 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2023-06-13 10:07:16 +0200 |
| commit | fc4547b412e28164af1bf8981234c6af959ccc0b (patch) | |
| tree | 061253e7a4f6abaca282223b36d10f0bed8cad23 | |
WIP
33 files changed, 1885 insertions, 0 deletions
diff --git a/.dir-locals.el b/.dir-locals.el new file mode 100644 index 0000000..484f453 --- /dev/null +++ b/.dir-locals.el @@ -0,0 +1,17 @@ +;;; Directory Local Variables +;;; For more information see (info "(emacs) Directory Variables") + +((c-mode + . + ((eval . + (let ((project-path + (locate-dominating-file default-directory ".dir-locals.el"))) + (setq-local flycheck-clangcheck-build-path + (concat project-path "build")))))) + (c++-mode + . + ((eval . + (let ((project-path + (locate-dominating-file default-directory ".dir-locals.el"))) + (setq-local flycheck-clangcheck-build-path + (concat project-path "build"))))))) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fd279fc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/build/ +/build-rel/ +/build-cov/ diff --git a/base/inc/macros.hh b/base/inc/macros.hh new file mode 100644 index 0000000..6d88669 --- /dev/null +++ b/base/inc/macros.hh @@ -0,0 +1,16 @@ +#ifndef BASE_MACROS_HH +#define BASE_MACROS_HH + +#if defined(HAVE_ATTRIBUTE_VISIBILITY_HIDDEN) +# define HIDDEN __attribute__((visibility ("hidden"))) +#else +# define HIDDEN +#endif + +#if defined(HAVE_ATTRIBUTE_UNLIKELY) +# define UNLIKELY [[unlikely]] +#else +# define UNLIKELY +#endif + +#endif // BASE_MACROS_HH diff --git a/base/meson.build b/base/meson.build new file mode 100644 index 0000000..71faace --- /dev/null +++ b/base/meson.build @@ -0,0 +1,19 @@ +cpp = meson.get_compiler('cpp') +cpp_flags = [] +if cpp.has_function_attribute('visibility:hidden') + cpp_flags += '-DHAVE_ATTRIBUTE_VISIBILITY_HIDDEN' +endif +if cpp.compiles('''int foo() { + [[unlikely]] + return 0; +}''', name: 'C++20 unlikely attribute') + cpp_flags += '-DHAVE_ATTRIBUTE_UNLIKELY' + cpp_flags += '-Wno-c++20-attribute-extensions' +endif + +inc = include_directories('inc') + +base_dep = declare_dependency( + compile_args: cpp_flags, + include_directories: inc, +) diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..2d571dc --- /dev/null +++ 
b/meson.build @@ -0,0 +1,30 @@ +project( + 'libmodxml', 'cpp', + version : '0.1', + meson_version: '>= 0.58', + default_options : [ + 'warning_level=3', + 'cpp_std=c++17', + 'cpp_rtti=false', + 'cpp_eh=none', + 'b_ndebug=if-release', + ], +) + +gtest_dep = dependency( + 'gtest', + version: '>= 1.10.0', + main: true, + fallback: ['gtest', 'gtest_main_dep']) + +gmock_dep = dependency( + 'gmock', + version: '>= 1.10.0', + main: false, + fallback: ['gtest', 'gmock_dep']) + +subdir('base') +subdir('utf') +subdir('sax') + +run_target('iwyu', command: 'scripts/iwyu.sh') diff --git a/sax/inc/sax_decoder.hh b/sax/inc/sax_decoder.hh new file mode 100644 index 0000000..40a56c9 --- /dev/null +++ b/sax/inc/sax_decoder.hh @@ -0,0 +1,57 @@ +#ifndef SAX_DECODER_HH +#define SAX_DECODER_HH + +#include <memory> +#include <string> +#include <string_view> + +namespace modxml { +namespace sax { + +/** + * Decoder returned by DecoderFactory. Used by Processor to turn bytes into + * unicode characters. + */ +class Decoder { + public: + virtual ~Decoder() = default; + + enum class State { + GOOD = 0, + // too little data was given to advance + NEED_MORE, + // invalid data was given to advance + INVALID, + }; + + /** + * Decode as many code points as possible from in (start at in_offset) and + * write them to out (start at out_offset) as UTF-8. + * All written code points must be valid per Unicode, so inside the + * range U+0 to U+10FFFF and not a surrogate pair (U+D800-U+DFFF). + * No partial output, only write to out if the whole UTF-8 sequence is + * going to fit. + * There are always at least 4 bytes available (out.size() - out_offset) when + * called. + * Advance in_offset for data consumed. + * Advance out_offset for code points written. Do NOT write past out.size(). + * Do NOT resize out. + * If at least one code point is decoded and written to out, return GOOD. 
+ * If it is not possible to decode a single code point, in_offset and + * out_offset should not be advanced and something other than GOOD returned. + * Do not keep any references to any of the parameters after returning, next + * decode() call will point to the following bytes, but all parameters + * may have changed as they are subject to the buffer implementations of the + * Processor. + */ + virtual State decode(std::string_view in, std::size_t& in_offset, + std::string& out, std::size_t& out_offset) = 0; + + protected: + Decoder() = default; +}; + +} // namespace sax +} // namespace modxml + +#endif // SAX_DECODER_HH diff --git a/sax/inc/sax_decoder_factory.hh b/sax/inc/sax_decoder_factory.hh new file mode 100644 index 0000000..80f1af3 --- /dev/null +++ b/sax/inc/sax_decoder_factory.hh @@ -0,0 +1,35 @@ +#ifndef SAX_DECODER_FACTORY_HH +#define SAX_DECODER_FACTORY_HH + +#include <memory> +#include <string> + +namespace modxml { +namespace sax { + +class Decoder; + +/** + * Factory for decoders. You can give one to ProcessorBuilder. + */ +class DecoderFactory { + public: + virtual ~DecoderFactory() = default; + + /** + * If encoding is supported, return a decoder for that encoding. + * Return nullptr if not supported and Processor will return + * UNKNOWN_ENCODING error. + * Note that encoding value isn't cleaned up or validated in any way, it is + * reported EXACTLY as found (even if not valid per XML spec). + */ + virtual std::unique_ptr<Decoder> create(std::string const& encoding) = 0; + + protected: + DecoderFactory() = default; +}; + +} // namespace sax +} // namespace modxml + +#endif // SAX_DECODER_FACTORY_HH diff --git a/sax/inc/sax_delegate.hh b/sax/inc/sax_delegate.hh new file mode 100644 index 0000000..ba63e72 --- /dev/null +++ b/sax/inc/sax_delegate.hh @@ -0,0 +1,22 @@ +#ifndef MODXML_SAX_DELEGATE_HH +#define MODXML_SAX_DELEGATE_HH + +namespace modxml { +namespace sax { + +/** + * Delegate for processor. + * Implement to handle events. 
+ */ +class Delegate { + public: + virtual ~Delegate() = default; + + protected: + Delegate() = default; +}; + +} // namespace sax +} // namespace modxml + +#endif // MODXML_SAX_DELEGATE_HH diff --git a/sax/inc/sax_error.hh b/sax/inc/sax_error.hh new file mode 100644 index 0000000..748f995 --- /dev/null +++ b/sax/inc/sax_error.hh @@ -0,0 +1,36 @@ +#ifndef MODXML_SAX_ERROR_HH +#define MODXML_SAX_ERROR_HH + +namespace modxml { +namespace sax { + +enum class Error { + /** + * The XML spec has a list of characters that are never allowed in a document. + */ + INVALID_CHAR, + /** + * If the document encoding is unsupported or unknown. + */ + UNKNOWN_ENCODING, + /** + * If the document is incomplete. This is one of the few recoverable errors, + * if you call the processor with more data it will continue. + */ + INCOMPLETE, + /** + * An entity in the document exceeded max buffer size (either set by + * ProcessorBuilder or the default 10 MiB). + */ + MAX_MEMORY_EXCEEDED, + /** + * A memory allocation failed. Note that this doesn't protect against + * usage of overallocated memory. + */ + OUT_OF_MEMORY, +}; + +} // namespace sax +} // namespace modxml + +#endif // MODXML_SAX_ERROR_HH diff --git a/sax/inc/sax_processor.hh b/sax/inc/sax_processor.hh new file mode 100644 index 0000000..7ca32f7 --- /dev/null +++ b/sax/inc/sax_processor.hh @@ -0,0 +1,37 @@ +#ifndef MODXML_SAX_PROCESSOR_HH +#define MODXML_SAX_PROCESSOR_HH + +#include <memory> + +namespace modxml { +namespace sax { + +class Delegate; + +/** + * The XML processor, or parser if you like that term better. + * Feed it data and the processor will give the delegate calls with events or + * possibly errors. + */ +class Processor { + public: + virtual ~Processor() = default; + + /** + * Construct a Processor. Same as creating a ProcessorBuilder + * and not changing any options and just calling build. 
+ */ + static std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate); + + protected: + Processor() = default; + + private: + Processor(Processor const&) = delete; + Processor& operator=(Processor const&) = delete; +}; + +} // namespace sax +} // namespace modxml + +#endif // MODXML_SAX_PROCESSOR_HH diff --git a/sax/inc/sax_processor_builder.hh b/sax/inc/sax_processor_builder.hh new file mode 100644 index 0000000..070fbbf --- /dev/null +++ b/sax/inc/sax_processor_builder.hh @@ -0,0 +1,82 @@ +#ifndef MODXML_SAX_PROCESSOR_BUILDER_HH +#define MODXML_SAX_PROCESSOR_BUILDER_HH + +#include <memory> +#include <string> + +namespace modxml { +namespace sax { + +class DecoderFactory; +class Delegate; +class Processor; + +/** + * Used to construct Processor's with options set if needed. + */ +class ProcessorBuilder { + public: + virtual ~ProcessorBuilder() = default; + + /** + * Construct a ProcessorBuilder. All options are set to default. + */ + static std::unique_ptr<ProcessorBuilder> create(); + + /** + * If you know the encoding of the data sent to the processor set it here, + * this will stop the processor from trying to autodetect and will ignore + * encoding in any xml declaration if found. + * If the encoding is unsupported/unknown the processor will fail with + * an error indicating this, same as if it read a xml declaration with + * an unsupported or unknown encoding. + */ + virtual ProcessorBuilder* force_encoding(std::string const& str) = 0; + + /** + * Set a decoder factory for encodings not supported by library. + * Library only calls this for encodings it doesn't support itself. + * Library supports UTF-8, UTF-16, UTF-32 and US-ASCII. + * If you want to force the decoder factory to be used, force a custom + * encoding with force_encoding above. + */ + virtual ProcessorBuilder* custom_decoder_factory( + std::shared_ptr<DecoderFactory> custom_decoder_factory) = 0; + + /** + * Set the default buffer size the processor should use. 
+ * If you give a too small buffer size (such as zero) it will be ignored + * and an implementation-specific minimum will be used instead. + * This is meant as a possible optimization and can be completely ignored. + * Note that the processor will allocate more data if needed. + */ + virtual ProcessorBuilder* set_default_buffer_size(std::size_t size) = 0; + + /** + * Set the max buffer size the processor should use. + * If you have memory constraints this will block the processing of CDATA, + * or other entities from allocating more than the given size. + * Default is 10MiB. + */ + virtual ProcessorBuilder* set_max_buffer_size(std::size_t size) = 0; + + /** + * Call to construct a Processor with the options set up in this builder, + * using the delegate given as parameter. + * May be called multiple times, will create a unique Processor each time. + */ + virtual std::unique_ptr<Processor> build( + std::shared_ptr<Delegate> delegate) const = 0; + + protected: + ProcessorBuilder() = default; + + private: + ProcessorBuilder(ProcessorBuilder const&) = delete; + ProcessorBuilder& operator=(ProcessorBuilder const&) = delete; +}; + +} // namespace sax +} // namespace modxml + +#endif // MODXML_SAX_PROCESSOR_BUILDER_HH diff --git a/sax/meson.build b/sax/meson.build new file mode 100644 index 0000000..ccbdef4 --- /dev/null +++ b/sax/meson.build @@ -0,0 +1,22 @@ +deps = [ + base_dep, + utf_dep, +] + +inc = include_directories('inc') +lib = shared_library( + 'modxmlsax', + 'src/decoder.cc', + 'src/sax_processor.cc', + 'src/sax_processor_builder.cc', + 'src/utils.cc', + dependencies: deps, + include_directories: inc, + install: true, +) + +sax_dep = declare_dependency( + dependencies: deps, + include_directories: inc, + link_with: lib, +) diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc new file mode 100644 index 0000000..30b1735 --- /dev/null +++ b/sax/src/decoder.cc @@ -0,0 +1,321 @@ +#include "decoder.hh" + +#include "macros.hh" +#include "sax_decoder.hh" +#include 
"utf16.hh" +#include "utf32.hh" +#include "utf8.hh" +#include "utf_error.hh" + +namespace modxml { +namespace sax { + +namespace { + +class UtfDecoder : public Decoder { + public: + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + if (bom_ == -1) UNLIKELY { + std::size_t tmp = in_offset; + uint32_t ret = read(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + if (ret == 0xfeff) { + // To allow offset to advance and to return, we need to + // read at least one more character completely. + ret = read(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + bom_ = 1; + } else { + bom_ = 0; + } + in_offset = tmp; + out[out_offset++] = ret; + if (out_offset == out_size) + return State::GOOD; + } + + do { + uint32_t ret = read(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? 
State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + return State::GOOD; + } + + protected: + UtfDecoder() = default; + + virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0; + + private: + int8_t bom_{-1}; +}; + +class Utf8Decoder : public UtfDecoder { + public: + Utf8Decoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read8(data, offset); + } +}; + +class Utf16BeDecoder : public UtfDecoder { + public: + Utf16BeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read16be(data, offset); + } +}; + +class Utf16LeDecoder : public UtfDecoder { + public: + Utf16LeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read16le(data, offset); + } +}; + +class Utf32BeDecoder : public UtfDecoder { + public: + Utf32BeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read32be(data, offset); + } +}; + +class Utf32LeDecoder : public UtfDecoder { + public: + Utf32LeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read32le(data, offset); + } +}; + +class Utf16Decoder : public Decoder { + public: + Utf16Decoder() = default; + + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + if (endian_ == -1) UNLIKELY { + std::size_t tmp = in_offset; + uint32_t ret = utf::read16be(in, tmp); + int8_t endian; + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + if (ret == 0xfeff) { + endian = 1; // Big endian + } else if (ret == 0xfffe) { + endian = 0; // Little endian + } else { + return State::INVALID; + } + + // To allow 
offset to advance and to return, we need to + // read at least one more character completely. + ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + + endian_ = endian; + in_offset = tmp; + out[out_offset++] = ret; + if (out_offset == out_size) + return State::GOOD; + } + + if (endian_ == 1) { + do { + uint32_t ret = utf::read16be(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } else { + do { + uint32_t ret = utf::read16le(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } + return State::GOOD; + } + + private: + int8_t endian_{-1}; +}; + +class Utf32Decoder : public Decoder { + public: + Utf32Decoder() = default; + + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + if (endian_ == -1) UNLIKELY { + std::size_t tmp = in_offset; + uint32_t ret = utf::read32be(in, tmp); + int8_t endian; + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + tmp = in_offset; + ret = utf::read32le(in, tmp); + if (ret == 0xfeff) { + endian = 0; // Little endian + } else { + return State::INVALID; + } + } else if (ret == 0xfeff) { + endian = 1; // Big endian + } else { + return State::INVALID; + } + + // To allow offset to advance and to return, we need to + // read the next character completely. + ret = endian == 1 ? 
utf::read32be(in, tmp) : utf::read32le(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + + endian_ = endian; + in_offset = tmp; + out[out_offset++] = ret; + if (out_offset == out_size) + return State::GOOD; + } + + if (endian_ == 1) { + do { + uint32_t ret = utf::read32be(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } else { + do { + uint32_t ret = utf::read32le(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } + return State::GOOD; + } + + private: + int8_t endian_{-1}; +}; + +class AsciiDecoder : public Decoder { + public: + AsciiDecoder() = default; + + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + do { + if (in_offset == in.size()) + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + if (in[in_offset] & 0x80) + return out_offset > out_start ? 
State::GOOD : State::INVALID; + out[out_offset++] = in[in_offset++]; + } while (out_offset < out_size); + return State::GOOD; + } +}; + +} // namespace + +std::unique_ptr<Decoder> create_utf8_decoder() { + return std::make_unique<Utf8Decoder>(); +} + +std::unique_ptr<Decoder> create_utf16be_decoder() { + return std::make_unique<Utf16BeDecoder>(); +} + +std::unique_ptr<Decoder> create_utf16le_decoder() { + return std::make_unique<Utf16LeDecoder>(); +} + +std::unique_ptr<Decoder> create_utf32be_decoder() { + return std::make_unique<Utf32BeDecoder>(); +} + +std::unique_ptr<Decoder> create_utf32le_decoder() { + return std::make_unique<Utf32LeDecoder>(); +} + +std::unique_ptr<Decoder> create_utf16_decoder() { + return std::make_unique<Utf16Decoder>(); +} + +std::unique_ptr<Decoder> create_utf32_decoder() { + return std::make_unique<Utf32Decoder>(); +} + +std::unique_ptr<Decoder> create_ascii_decoder() { + return std::make_unique<AsciiDecoder>(); +} + +} // namespace sax +} // namespace modxml + diff --git a/sax/src/decoder.hh b/sax/src/decoder.hh new file mode 100644 index 0000000..bd2a99a --- /dev/null +++ b/sax/src/decoder.hh @@ -0,0 +1,33 @@ +#ifndef DECODER_HH +#define DECODER_HH + +#include "macros.hh" + +#include <memory> + +namespace modxml { +namespace sax { + +class Decoder; + +// UTF-8 with optional BOM +std::unique_ptr<Decoder> HIDDEN create_utf8_decoder(); +// UTF-16 with BOM +std::unique_ptr<Decoder> HIDDEN create_utf16_decoder(); +// UTF-16BE with optional BOM +std::unique_ptr<Decoder> HIDDEN create_utf16be_decoder(); +// UTF-16LE with optional BOM +std::unique_ptr<Decoder> HIDDEN create_utf16le_decoder(); +// UTF-32 with BOM +std::unique_ptr<Decoder> HIDDEN create_utf32_decoder(); +// UTF-32BE with optional BOM +std::unique_ptr<Decoder> HIDDEN create_utf32be_decoder(); +// UTF-32LE with optional BOM +std::unique_ptr<Decoder> HIDDEN create_utf32le_decoder(); +// US-ASCII +std::unique_ptr<Decoder> HIDDEN create_ascii_decoder(); + +} // namespace sax +} // 
namespace modxml + +#endif // DECODER_HH diff --git a/sax/src/processor.hh b/sax/src/processor.hh new file mode 100644 index 0000000..4a2de29 --- /dev/null +++ b/sax/src/processor.hh @@ -0,0 +1,27 @@ +#ifndef PROCESSOR_HH +#define PROCESSOR_HH + +#include "macros.hh" + +#include <memory> +#include <optional> +#include <string> + +namespace modxml { +namespace sax { + +class DecoderFactory; +class Delegate; +class Processor; + +std::unique_ptr<Processor> HIDDEN create_processor( + std::shared_ptr<Delegate> delegate, + std::shared_ptr<DecoderFactory> decoder_factory, + std::optional<std::string> force_encoding, + std::optional<std::size_t> default_buffer_size, + std::optional<std::size_t> max_buffer_size); + +} // namespace sax +} // namespace modxml + +#endif // PROCESSOR_HH diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc new file mode 100644 index 0000000..ea9f753 --- /dev/null +++ b/sax/src/sax_processor.cc @@ -0,0 +1,145 @@ +#include "sax_processor.hh" + +#include "sax_decoder.hh" +#include "processor.hh" +#include "utils.hh" + +#include <algorithm> +#include <optional> +#include <utility> + +namespace modxml { +namespace sax { + +namespace { + +// 2.2 Characters +// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + +inline bool valid_char(uint32_t c) { + // Assume valid unicode (U+0 - U+10ffff except surrogate blocks) + if (c >= 0x20 && c <= 0xfffd) + return true; + if (c == 0x9 || c == 0xa || c == 0xd) + return true; + return c >= 0x10000; +} + +// 2.3 Common Syntactic Constructs +// [3] S ::= (#x20 | #x9 | #xD | #xA)+ + +inline bool is_ws(uint32_t c) { + // Assume we already checked for valid_char. 
+ return c <= 0x20; +} + +// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] +// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] + +inline bool is_namestartchar(uint32_t c) { + if (c < 0x41 /* A */) + return c == 0x3a /* : */; + if (c <= 0x5a /* Z */) + return true; + if (c < 0x61 /* a */) + return c == 0x5f /* _ */; + if (c <= 0x7a /* z */) + return true; + if (c < 0xc0) + return false; + if (c < 0x300) + return c != 0xd7 && c != 0xf7; + if (c < 0x370 || c == 0x37e) // [#x300-#x36F] is NameChar only, #x37E is excluded + return false; + if (c > 0x1fff && c < 0x200c) + return false; + if (c > 0x200d && c < 0x2070) + return false; + if (c > 0x218f && c < 0x2c00) + return false; + if (c > 0x2fef && c < 0x3001) + return false; + // Already valid_char so don't check for surrogate pair here. + if (c > 0xdfff && c < 0xf900) + return false; + if (c > 0xfdcf && c < 0xfdf0) + return false; + if (c > 0xfffd && c < 0x10000) + return false; + return c <= 0xeffff; // [#x10000-#xEFFFF] is the last allowed range +} + +inline bool is_namechar(uint32_t c) { + return is_namestartchar(c) || (c >= 0x2d /* - */ && c <= 0x2e /* . 
*/) || + (c >= 0x30 /* 0 */ && c <= 0x39 /* 9 */) || (c == 0xb7) || + (c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040); +} + +/* [5] Name ::= NameStartChar (NameChar)* +[6] Names ::= Name (#x20 Name)* +[7] Nmtoken ::= (NameChar)+ +[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* +*/ + +class ProcessorImpl : public Processor { + public: + ProcessorImpl(std::shared_ptr<Delegate> delegate, + std::shared_ptr<DecoderFactory> decoder_factory, + std::unique_ptr<Decoder> decoder, + std::size_t default_buffer_size, + std::size_t max_buffer_size) + : delegate_(std::move(delegate)), + decoder_factory_(std::move(decoder_factory)), + decoder_(std::move(decoder)), + default_buffer_size_(default_buffer_size), + max_buffer_size_(max_buffer_size) {} + + private: + std::shared_ptr<Delegate> delegate_; + std::shared_ptr<DecoderFactory> decoder_factory_; + std::unique_ptr<Decoder> decoder_; + std::size_t default_buffer_size_; + std::size_t max_buffer_size_; +}; + +} // namespace + +std::unique_ptr<Processor> create_processor( + std::shared_ptr<Delegate> delegate, + std::shared_ptr<DecoderFactory> decoder_factory, + std::optional<std::string> force_encoding, + std::optional<std::size_t> opt_default_buffer_size, + std::optional<std::size_t> opt_max_buffer_size) { + + std::unique_ptr<Decoder> decoder; + if (force_encoding.has_value()) { + decoder = pick_decoder_for_encoding(force_encoding.value(), + decoder_factory.get()); + } + + std::size_t default_buffer_size = 8192; + if (opt_default_buffer_size.has_value()) + default_buffer_size = std::max(static_cast<std::size_t>(128), + opt_default_buffer_size.value()); + // This value is documented in public headers. Do NOT change. + std::size_t max_buffer_size = 10 * 1024 * 1024; + // No validation for user set value. If it is too small MAX_MEMORY_EXCEEDED + // error will be thrown. If it is too large we will get OUT_OF_MEMORY or + // crash depending on platform. 
+ if (opt_max_buffer_size.has_value()) + max_buffer_size = opt_max_buffer_size.value(); + + return std::make_unique<ProcessorImpl>(std::move(delegate), + std::move(decoder_factory), + std::move(decoder), + default_buffer_size, + max_buffer_size); +} + +std::unique_ptr<Processor> Processor::create(std::shared_ptr<Delegate> delegate) { // static member declared in sax_processor.hh; all options left at defaults + return create_processor(std::move(delegate), nullptr, + std::nullopt, std::nullopt, std::nullopt); +} + +} // namespace sax +} // namespace modxml diff --git a/sax/src/sax_processor_builder.cc b/sax/src/sax_processor_builder.cc new file mode 100644 index 0000000..8817099 --- /dev/null +++ b/sax/src/sax_processor_builder.cc @@ -0,0 +1,62 @@ +#include "sax_processor_builder.hh" + +#include "processor.hh" +#include "sax_processor.hh" + +#include <optional> +#include <utility> + +namespace modxml { +namespace sax { + +namespace { + +class ProcessorBuilderImpl : public ProcessorBuilder { + public: + ProcessorBuilder* force_encoding(std::string const& str) override { + force_encoding_ = str; + return this; + } + + ProcessorBuilder* custom_decoder_factory( + std::shared_ptr<DecoderFactory> custom_decoder_factory) override { + decoder_factory_ = std::move(custom_decoder_factory); + return this; + } + + ProcessorBuilder* set_default_buffer_size(std::size_t size) override { + default_buffer_size_ = size; + return this; + } + + ProcessorBuilder* set_max_buffer_size(std::size_t size) override { + max_buffer_size_ = size; + return this; + } + + std::unique_ptr<Processor> build( + std::shared_ptr<Delegate> delegate) const override { + return create_processor(std::move(delegate), + decoder_factory_, + force_encoding_, + default_buffer_size_, + max_buffer_size_); + } + + ProcessorBuilderImpl() = default; + + private: + std::shared_ptr<DecoderFactory> decoder_factory_; + std::optional<std::string> force_encoding_; + std::optional<std::size_t> 
default_buffer_size_; + std::optional<std::size_t> max_buffer_size_; +}; + +} // namespace + 
+std::unique_ptr<ProcessorBuilder> ProcessorBuilder::create() { + return std::make_unique<ProcessorBuilderImpl>(); +} + +} // namespace sax +} // namespace modxml diff --git a/sax/src/utils.cc b/sax/src/utils.cc new file mode 100644 index 0000000..f0366d5 --- /dev/null +++ b/sax/src/utils.cc @@ -0,0 +1,70 @@ +#include "utils.hh" + +#include "decoder.hh" +#include "sax_decoder.hh" +#include "sax_decoder_factory.hh" + +namespace modxml { +namespace sax { + +namespace { + +std::string cleanup_encoding(std::string const& str) { + std::string ret; + ret.reserve(str.size()); + for (auto c : str) { + if (c >= 'A' && c <= 'Z') { + ret.push_back(c | 0x20); + } else if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) { + ret.push_back(c); + } else if (c == '.' || c == '_' || c == '-') { + ret.push_back('-'); + } + } + return ret; +} + +} // namespace + +// Names inspired by: +// https://www.iana.org/assignments/character-sets/character-sets.xhtml +std::unique_ptr<Decoder> pick_decoder_for_encoding( + std::string const& encoding, DecoderFactory* factory) { + auto clean_enc = cleanup_encoding(encoding); + if (clean_enc == "utf-8" || clean_enc == "utf8") { + return create_utf8_decoder(); + } + if (clean_enc == "utf-16" || clean_enc == "utf16") { + return create_utf16_decoder(); + } + if (clean_enc == "utf-16be" || clean_enc == "utf16be") { + return create_utf16be_decoder(); + } + if (clean_enc == "utf-16le" || clean_enc == "utf16le") { + return create_utf16le_decoder(); + } + if (clean_enc == "utf-32" || clean_enc == "utf32") { + return create_utf32_decoder(); + } + if (clean_enc == "utf-32be" || clean_enc == "utf32be") { + return create_utf32be_decoder(); + } + if (clean_enc == "utf-32le" || clean_enc == "utf32le") { + return create_utf32le_decoder(); + } + if (clean_enc == "ascii" || clean_enc == "us-ascii" || + clean_enc == "usascii" || clean_enc == "iso-ir-6" || + clean_enc == "ansi-x3-4-1968" || clean_enc == "ansi-x3-4-1986" || + clean_enc == "iso-646-irv1991" || 
clean_enc == "iso646-us" || + clean_enc == "us" || clean_enc == "ibm367" || clean_enc == "cp367") { + return create_ascii_decoder(); + } + if (factory) { + return factory->create(encoding); + } + return nullptr; +} + +} // namespace sax + +} // namespace modxml diff --git a/sax/src/utils.hh b/sax/src/utils.hh new file mode 100644 index 0000000..206d003 --- /dev/null +++ b/sax/src/utils.hh @@ -0,0 +1,22 @@ +#ifndef UTILS_HH +#define UTILS_HH + +#include "macros.hh" + +#include <memory> +#include <string> + +namespace modxml { +namespace sax { + +class Decoder; +class DecoderFactory; + +std::unique_ptr<Decoder> HIDDEN pick_decoder_for_encoding( + std::string const& encoding, + DecoderFactory* factory); + +} // namespace sax +} // namespace modxml + +#endif // UTILS_HH diff --git a/scripts/iwyu.sh b/scripts/iwyu.sh new file mode 100644 index 0000000..dfdd651 --- /dev/null +++ b/scripts/iwyu.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +iwyu_tool=iwyu-tool + +if ! command -v "$iwyu_tool" &> /dev/null; then + iwyu_tool=iwyu_tool.py +fi + +cpus=`grep processor /proc/cpuinfo | wc -l` + +exec $iwyu_tool -o clang -j $cpus -p "${MESON_BUILD_ROOT}" -- -Xiwyu --mapping_file="${MESON_SOURCE_ROOT}"/iwyu-mappings.imp diff --git a/subprojects/.gitignore b/subprojects/.gitignore new file mode 100644 index 0000000..847cb91 --- /dev/null +++ b/subprojects/.gitignore @@ -0,0 +1,2 @@ +googletest-1.13.0/ +packagecache/ diff --git a/subprojects/gtest.wrap b/subprojects/gtest.wrap new file mode 100644 index 0000000..195aaca --- /dev/null +++ b/subprojects/gtest.wrap @@ -0,0 +1,15 @@ +[wrap-file] +directory = googletest-1.13.0 +source_url = https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz +source_filename = gtest-1.13.0.tar.gz +source_hash = ad7fdba11ea011c1d925b3289cf4af2c66a352e18d4c7264392fead75e919363 +patch_filename = gtest_1.13.0-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/gtest_1.13.0-1/get_patch +patch_hash = 
6d82a02c3a45071cea989983bf6becde801cbbfd29196ba30dada0215393b082 +wrapdb_version = 1.13.0-1 + +[provide] +gtest = gtest_dep +gtest_main = gtest_main_dep +gmock = gmock_dep +gmock_main = gmock_main_dep diff --git a/utf/inc/utf16.hh b/utf/inc/utf16.hh new file mode 100644 index 0000000..344b1a2 --- /dev/null +++ b/utf/inc/utf16.hh @@ -0,0 +1,31 @@ +#ifndef UTF_UTF16_HH +#define UTF_UTF16_HH + +#include "macros.hh" + +#include <cstdint> +#include <string_view> + +namespace utf { + +/* Read one unicode codepoint from UTF-16 BigEndian encoded data if possible. + * If successfull offset is incremented to point to next codepoint. + * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs, + * returns INVALID. + */ +uint32_t HIDDEN read16be(std::string_view data, std::size_t& offset); + +/* Read one unicode codepoint from UTF-16 LittleEndian encoded data if possible. + * If successfull offset is incremented to point to next codepoint. + * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs, + * returns INVALID. + */ +uint32_t HIDDEN read16le(std::string_view data, std::size_t& offset); + +} // namespace utf + +#endif // UTF_UTF16_HH diff --git a/utf/inc/utf32.hh b/utf/inc/utf32.hh new file mode 100644 index 0000000..2d3088e --- /dev/null +++ b/utf/inc/utf32.hh @@ -0,0 +1,29 @@ +#ifndef UTF_UTF32_HH +#define UTF_UTF32_HH + +#include "macros.hh" + +#include <cstdint> +#include <string_view> + +namespace utf { + +/* Read one unicode codepoint from UTF-32 BigEndian encoded data if possible. + * If successfull offset is incremented to point to next codepoint. + * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-32, ie. outside valid ranges, returns INVALID. 
+ */ +uint32_t HIDDEN read32be(std::string_view data, std::size_t& offset); + +/* Read one unicode codepoint from UTF-32 LittleEndian encoded data if possible. + * If successful, offset is incremented to point to next codepoint. + * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-32, i.e. outside valid ranges, returns INVALID. + */ +uint32_t HIDDEN read32le(std::string_view data, std::size_t& offset); + +} // namespace utf + +#endif // UTF_UTF32_HH diff --git a/utf/inc/utf8.hh b/utf/inc/utf8.hh new file mode 100644 index 0000000..a3ea84a --- /dev/null +++ b/utf/inc/utf8.hh @@ -0,0 +1,22 @@ +#ifndef UTF_UTF8_HH +#define UTF_UTF8_HH + +#include "macros.hh" + +#include <cstdint> +#include <string_view> + +namespace utf { + +/* Read one unicode codepoint from UTF-8 encoded data if possible. + * If successful, offset is incremented to point to next codepoint. + * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-8, this includes overlong encodings and + * invalid unicode code points, returns INVALID. 
+ */ +uint32_t HIDDEN read8(std::string_view data, std::size_t& offset); + +} // namespace utf + +#endif // UTF_UTF8_HH diff --git a/utf/inc/utf_error.hh b/utf/inc/utf_error.hh new file mode 100644 index 0000000..079fa43 --- /dev/null +++ b/utf/inc/utf_error.hh @@ -0,0 +1,13 @@ +#ifndef UTF_ERROR_HH +#define UTF_ERROR_HH + +#include <cstdint> + +namespace utf { + +constexpr uint32_t NEED_MORE = 0xfffffffe; +constexpr uint32_t INVALID = 0xffffffff; + +} // namespace utf + +#endif // UTF_ERROR_HH diff --git a/utf/meson.build b/utf/meson.build new file mode 100644 index 0000000..64db6ff --- /dev/null +++ b/utf/meson.build @@ -0,0 +1,38 @@ +deps = [ + base_dep, +] + +inc = include_directories('inc') +lib = static_library( + 'utf', + 'src/utf8.cc', + 'src/utf16.cc', + 'src/utf32.cc', + dependencies: deps, + include_directories: inc, + install: false, +) + +utf_dep = declare_dependency( + dependencies: deps, + include_directories: inc, + link_with: lib, +) + +test('utf8', + executable( + 'test_utf8', + sources: ['tst/test_utf8.cc'], + dependencies: [utf_dep, gtest_dep])) + +test('utf16', + executable( + 'test_utf16', + sources: ['tst/test_utf16.cc'], + dependencies: [utf_dep, gtest_dep])) + +test('utf32', + executable( + 'test_utf32', + sources: ['tst/test_utf32.cc'], + dependencies: [utf_dep, gtest_dep])) diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc new file mode 100644 index 0000000..43595bf --- /dev/null +++ b/utf/src/utf16.cc @@ -0,0 +1,67 @@ +#include "utf16.hh" + +#include "utf_error.hh" + +namespace utf { + +namespace { + +inline bool is_high_surrogate(uint16_t c) { + return c >= 0xd800 && c <= 0xdbff; +} + +inline bool is_low_surrogate(uint16_t c) { + return c >= 0xdc00 && c <= 0xdfff; +} + +} // namespace + +uint32_t read16be(std::string_view data, std::size_t& offset) { + if (offset > data.size() || data.size() - offset < 2) + return NEED_MORE; + uint16_t c = static_cast<uint16_t>(data[offset]) << 8 + | static_cast<uint16_t>(data[offset + 1] & 0xff); + if 
(is_high_surrogate(c)) { + if (data.size() - offset < 4) + return NEED_MORE; + uint16_t d = static_cast<uint16_t>(data[offset + 2]) << 8 + | static_cast<uint16_t>(data[offset + 3] & 0xff); + if (is_low_surrogate(d)) { + offset += 4; + return 0x10000 + + (static_cast<uint32_t>(c & 0x3ff) << 10 + | (d & 0x3ff)); + } + return INVALID; + } else if (is_low_surrogate(c)) { + return INVALID; + } + offset += 2; + return c; +} + +uint32_t read16le(std::string_view data, std::size_t& offset) { + if (offset > data.size() || data.size() - offset < 2) + return NEED_MORE; + uint16_t c = static_cast<uint16_t>(data[offset + 1]) << 8 + | static_cast<uint16_t>(data[offset] & 0xff); + if (is_high_surrogate(c)) { + if (data.size() - offset < 4) + return NEED_MORE; + uint16_t d = static_cast<uint16_t>(data[offset + 3]) << 8 + | static_cast<uint16_t>(data[offset + 2] & 0xff); + if (is_low_surrogate(d)) { + offset += 4; + return 0x10000 + + (static_cast<uint32_t>(c & 0x3ff) << 10 + | (d & 0x3ff)); + } + return INVALID; + } else if (is_low_surrogate(c)) { + return INVALID; + } + offset += 2; + return c; +} + +} // namespace utf diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc new file mode 100644 index 0000000..cfa29b6 --- /dev/null +++ b/utf/src/utf32.cc @@ -0,0 +1,43 @@ +#include "utf32.hh" + +#include "utf_error.hh" + +namespace utf { + +namespace { + +inline bool valid_codepoint(uint32_t c) { + return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff); +} + +} // namespace + +uint32_t read32be(std::string_view data, std::size_t& offset) { + if (offset > data.size() || data.size() - offset < 4) + return NEED_MORE; + uint32_t c = static_cast<uint32_t>(data[offset]) << 24 + | static_cast<uint32_t>(data[offset + 1] & 0xff) << 16 + | static_cast<uint32_t>(data[offset + 2] & 0xff) << 8 + | static_cast<uint32_t>(data[offset + 3] & 0xff); + if (valid_codepoint(c)) { + offset += 4; + return c; + } + return INVALID; +} + +uint32_t read32le(std::string_view data, std::size_t& offset) { + if (offset > 
data.size() || data.size() - offset < 4) + return NEED_MORE; + uint32_t c = static_cast<uint32_t>(data[offset + 3]) << 24 + | static_cast<uint32_t>(data[offset + 2] & 0xff) << 16 + | static_cast<uint32_t>(data[offset + 1] & 0xff) << 8 + | static_cast<uint32_t>(data[offset] & 0xff); + if (valid_codepoint(c)) { + offset += 4; + return c; + } + return INVALID; +} + +} // namespace utf diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc new file mode 100644 index 0000000..54b0296 --- /dev/null +++ b/utf/src/utf8.cc @@ -0,0 +1,68 @@ +#include "utf8.hh" + +#include "utf_error.hh" + +namespace utf { + +namespace { + +inline bool valid_codepoint(uint32_t c) { + return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff); +} + +} // namespace + +uint32_t read8(std::string_view data, std::size_t& offset) { + if (offset >= data.size()) + return NEED_MORE; + uint32_t ret; + uint8_t size; + switch (static_cast<uint8_t>(data[offset]) >> 4) { + case 15: + if (data[offset] & 0x08) + return INVALID; + ret = static_cast<uint32_t>(data[offset] & 0x07) << 18; + size = 4; + break; + case 14: + ret = static_cast<uint32_t>(data[offset] & 0x0f) << 12; + size = 3; + break; + case 13: + case 12: + ret = static_cast<uint32_t>(data[offset] & 0x1f) << 6; + size = 2; + break; + default: + if (data[offset] & 0x80) + return INVALID; + return data[offset++]; + } + if (data.size() - offset < size) + return NEED_MORE; + for (uint8_t i = 1; i < size; ++i) { + if ((data[offset + i] & 0xc0) != 0x80) + return INVALID; + ret |= static_cast<uint32_t>(data[offset + i] & 0x3f) << (size - i - 1) * 6; + } + if (!valid_codepoint(ret)) + return INVALID; + switch (size) { + case 4: + if (ret < 0x10000) + return INVALID; + break; + case 3: + if (ret < 0x800) + return INVALID; + break; + case 2: + if (ret < 0x80) + return INVALID; + break; + } + offset += size; + return ret; +} + +} // namespace utf diff --git a/utf/tst/test_utf16.cc b/utf/tst/test_utf16.cc new file mode 100644 index 0000000..c17982e --- /dev/null +++ 
b/utf/tst/test_utf16.cc @@ -0,0 +1,157 @@ +#include "utf16.hh" + +#include "utf_error.hh" + +#include <gtest/gtest.h> + +TEST(utf16be, sanity) { + std::string_view str("\x00\x24", 2); + size_t offset = 0; + auto ret = utf::read16be(str, offset); + EXPECT_EQ('$', ret); + EXPECT_EQ(2, offset); + + str = "\x20\xAC"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(2, offset); + + str = "\xD8\x01\xDC\x37"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(0x10437, ret); + EXPECT_EQ(4, offset); + + str = "\xD8\x52\xDF\x62"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(0x24B62, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf16le, sanity) { + std::string_view str("\x24\x00", 2); + size_t offset = 0; + auto ret = utf::read16le(str, offset); + EXPECT_EQ('$', ret); + EXPECT_EQ(2, offset); + + str = "\xAC\x20"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(2, offset); + + str = "\x01\xD8\x37\xDC"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(0x10437, ret); + EXPECT_EQ(4, offset); + + str = "\x52\xD8\x62\xDF"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(0x24B62, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf16be, bom) { + std::string_view str("\xFE\xFF\x20\xAC"); + size_t offset = 0; + auto ret = utf::read16be(str, offset); + EXPECT_EQ(0xFEFF, ret); + ret = utf::read16be(str, offset); + EXPECT_EQ(0x20AC, ret); + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} + +TEST(utf16le, bom) { + std::string_view str("\xFF\xFE\xAC\x20"); + size_t offset = 0; + auto ret = utf::read16le(str, offset); + EXPECT_EQ(0xFEFF, ret); + ret = utf::read16le(str, offset); + EXPECT_EQ(0x20AC, ret); + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} + +TEST(utf16be, invalid) { + std::string_view str("\xD8"); + size_t offset = 0; + auto ret = 
utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\xD8\x01"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\xD8\x01\xDC"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\xDC\x37\xD8\x01"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xD8\x01\xD8\x01"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf16le, invalid) { + std::string_view str("\x01"); + size_t offset = 0; + auto ret = utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\x01\xD8"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\x01\xD8\x37"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\x37\xDC\x01\xD8"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\x01\xD8\x01\xD8"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); +} diff --git a/utf/tst/test_utf32.cc b/utf/tst/test_utf32.cc new file mode 100644 index 0000000..796b4cd --- /dev/null +++ b/utf/tst/test_utf32.cc @@ -0,0 +1,145 @@ +#include "utf32.hh" + +#include "utf_error.hh" + +#include <gtest/gtest.h> + +TEST(utf32be, sanity) { + std::string_view str("\x00\x00\x00\x24", 4); + size_t offset = 0; + auto ret = utf::read32be(str, offset); + EXPECT_EQ('$', ret); + 
EXPECT_EQ(4, offset); + + str = std::string_view("\x00\x00\x20\xAC", 4); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(4, offset); + + str = std::string_view("\x00\x01\x04\x37", 4); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(0x10437, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf32le, sanity) { + std::string_view str("\x24\x00\x00\x00", 4); + size_t offset = 0; + auto ret = utf::read32le(str, offset); + EXPECT_EQ('$', ret); + EXPECT_EQ(4, offset); + + str = std::string_view("\xAC\x20\x00\x00", 4); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(4, offset); + + str = std::string_view("\x37\x04\x01\x00", 4); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(0x10437, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf32be, invalid) { + std::string_view str("\xFF\xFF\xFF\xFF"); + size_t offset = 0; + auto ret = utf::read32be(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00\xD8\x00", 4); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00", 1); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00", 2); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00\x00", 3); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf32le, invalid) { + std::string_view str("\xFF\xFF\xFF\xFF"); + size_t offset = 0; + auto ret = utf::read32le(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\xD8\x00\x00", 4); + offset = 0; + 
ret = utf::read32le(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00", 1); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00", 2); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00\x00", 3); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf32be, bom) { + std::string_view str("\x00\x00\xFF\xFE\x00\x00\x20\xAC", 8); + size_t offset = 0; + auto ret = utf::read32be(str, offset); + EXPECT_EQ(0xFFFE, ret); + ret = utf::read32be(str, offset); + EXPECT_EQ(0x20AC, ret); + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} + +TEST(utf32le, bom) { + std::string_view str("\xFE\xFF\x00\x00\xAC\x20\x00\x00", 8); + size_t offset = 0; + auto ret = utf::read32le(str, offset); + EXPECT_EQ(0xFFFE, ret); + ret = utf::read32le(str, offset); + EXPECT_EQ(0x20AC, ret); + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} diff --git a/utf/tst/test_utf8.cc b/utf/tst/test_utf8.cc new file mode 100644 index 0000000..10df969 --- /dev/null +++ b/utf/tst/test_utf8.cc @@ -0,0 +1,188 @@ +#include "utf8.hh" + +#include "utf_error.hh" + +#include <gtest/gtest.h> + +TEST(utf8, sanity) { + std::string_view str("$"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ('$', ret); + EXPECT_EQ(1, offset); + + str = "\xC2\xA3"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0xa3, ret); + EXPECT_EQ(2, offset); + + str = "\xD0\x98"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0x418, ret); + EXPECT_EQ(2, offset); + + str = 
"\xE0\xA4\xB9"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0x939, ret); + EXPECT_EQ(3, offset); + + str = "\xE2\x82\xAC"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(3, offset); + + str = "\xED\x95\x9C"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0xD55C, ret); + EXPECT_EQ(3, offset); + + str = "\xF0\x90\x8D\x88"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0x10348, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf8, overlong) { + std::string_view str("\xF0\x82\x82\xAC"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xE0\x81\x81"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xC0\x80"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf8, invalid) { + std::string_view str("\xED\xB0\x80"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xFB\xFF\xFF"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xFF\xFF\xFF\xFF\xFF"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\x80"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xC2"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\xC2\x03"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xE0\xA4"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str 
= "\xF0\x90\x8D"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf8, multiple1) { + std::string_view str("\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69" + "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ('M', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0xEC, ret); + ret = utf::read8(str, offset); + EXPECT_EQ('n', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('h', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(' ', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('n', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0xF3, ret); + ret = utf::read8(str, offset); + EXPECT_EQ('i', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(' ', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('t', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('i', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x1EBF, ret); + ret = utf::read8(str, offset); + EXPECT_EQ('n', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('g', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(' ', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('V', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('i', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x1EC7, ret); + ret = utf::read8(str, offset); + EXPECT_EQ('t', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} + +TEST(utf8, multiple2) { + std::string_view str("\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ(0x2825F, ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x5450, ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x35C2, ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x8D8A, ret); + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} |
