diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2024-01-21 12:31:30 +0100 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2024-01-21 12:31:30 +0100 |
| commit | 7dd49c6293172b494c78918507242cdb55d35137 (patch) | |
| tree | 9c8ab822ab9501a5ea2f937e609144e00ea091c4 | |
| parent | fc4547b412e28164af1bf8981234c6af959ccc0b (diff) | |
WIP
31 files changed, 2928 insertions, 414 deletions
diff --git a/base/meson.build b/base/meson.build index 71faace..7668487 100644 --- a/base/meson.build +++ b/base/meson.build @@ -8,7 +8,6 @@ if cpp.compiles('''int foo() { return 0; }''', name: 'C++20 unlikely attribute') cpp_flags += '-DHAVE_ATTRIBUTE_UNLIKELY' - cpp_flags += '-Wno-c++20-attribute-extensions' endif inc = include_directories('inc') diff --git a/meson.build b/meson.build index 2d571dc..d8a9641 100644 --- a/meson.build +++ b/meson.build @@ -4,7 +4,7 @@ project( meson_version: '>= 0.58', default_options : [ 'warning_level=3', - 'cpp_std=c++17', + 'cpp_std=c++20', 'cpp_rtti=false', 'cpp_eh=none', 'b_ndebug=if-release', diff --git a/sax/inc/sax_attributes.hh b/sax/inc/sax_attributes.hh new file mode 100644 index 0000000..4ab1a44 --- /dev/null +++ b/sax/inc/sax_attributes.hh @@ -0,0 +1,146 @@ +#ifndef SAX_ATTRIBUTES_HH +#define SAX_ATTRIBUTES_HH + +#include <iterator> +#include <optional> +#include <string_view> + +namespace modxml { +namespace sax { + +struct Attribute { + std::string_view name; + std::string_view value; + + Attribute(std::string_view name, std::string_view value); +}; + +/** + * A view of attributes, with utility functions. + */ +class Attributes { + public: + virtual ~Attributes() = default; + + class iterator { + public: + using iterator_category = std::random_access_iterator_tag; + using difference_type = std::ptrdiff_t; + using element_type = Attribute; + using pointer = element_type const *; + using reference = element_type const &; + + iterator() + : attributes_(nullptr), index_(0) {} + iterator(iterator const& it) + : attributes_(it.attributes_), index_(it.index_) {} + iterator& operator=(iterator const& it) { + attributes_ = it.attributes_; + index_ = it.index_; + return *this; + } + + /** + * Comparing two iterators from different Attributes instances is undefined. 
+ */ + bool operator==(iterator const& it) const { + return index_ == it.index_; + } + std::strong_ordering operator<=>(iterator const& it) const { + return index_ <=> it.index_; + } + + pointer operator->() const { return &attributes_->at(index_); } + reference operator*() const { return attributes_->at(index_); } + reference operator[](difference_type i) const { + return attributes_->at(index_ + i); + } + + iterator& operator++() { + ++index_; + return *this; + } + iterator operator++(int) { + auto ret = *this; + ++index_; + return ret; + } + iterator& operator+=(difference_type i) { + index_ += i; + return *this; + } + iterator operator+(difference_type i) const { + return iterator(attributes_, index_ + i); + } + friend iterator operator+(difference_type i, iterator const &it) { + return iterator(it.attributes_, it.index_ + i); + } + iterator& operator--() { + --index_; + return *this; + } + iterator operator--(int) { + auto ret = *this; + --index_; + return ret; + } + iterator& operator-=(difference_type i) { + index_ -= i; + return *this; + } + difference_type operator-(iterator const& it) const { + return index_ - it.index_; + } + iterator operator-(difference_type i) const { + return iterator(attributes_, index_ - i); + } + + protected: + iterator(Attributes const* attributes, std::size_t index) + : attributes_(attributes), index_(index) {} + + private: + Attributes const* attributes_; + std::size_t index_; + }; + + static_assert(std::random_access_iterator<iterator>); + + virtual iterator begin() const = 0; + virtual iterator end() const = 0; + + virtual std::size_t size() const = 0; + /** + * name and value of attribute are valid as long as Attributes instance is. + */ + virtual Attribute const& at(std::size_t index) const = 0; + + Attribute const& operator[](std::size_t index) const { return at(index); } + + /** + * Return the first attribute with name, if any. 
+ */ + virtual std::optional<std::string_view> find_first( + std::string_view name) const; + + /** + * Return the last attribute with name, if any. + */ + virtual std::optional<std::string_view> find_last( + std::string_view name) const; + + /** + * Return the index of the attribute with name, starting with offset. + */ + virtual std::optional<std::size_t> find(std::string_view name, + std::size_t index = 0) const; + + protected: + Attributes() = default; +}; + +} // namespace sax +} // namespace modxml + + +#endif // SAX_ATTRIBUTES_HH diff --git a/sax/inc/sax_decoder.hh b/sax/inc/sax_decoder.hh index 40a56c9..8b2490c 100644 --- a/sax/inc/sax_decoder.hh +++ b/sax/inc/sax_decoder.hh @@ -1,16 +1,15 @@ #ifndef SAX_DECODER_HH #define SAX_DECODER_HH -#include <memory> -#include <string> -#include <string_view> +#include <cstdint> +#include <span> namespace modxml { namespace sax { /** * Decoder returned by DecoderFactory. Used by Processor to turn bytes into - * unicode characters. + * unicode characters encoded as UTF-8. */ class Decoder { public: @@ -18,9 +17,9 @@ class Decoder { enum class State { GOOD = 0, - // too little data was given to advance + // too little data was given to decode NEED_MORE, - // invalid data was given to advance + // invalid data was given to decode INVALID, }; @@ -29,23 +28,22 @@ class Decoder { * write them to out (start at out_offset) as UTF-8. * All written code points must be valid per Unicode, so inside the * range U+0 to U+10FFFF and not a surrogate pair (U+D800-U+DFFF). - * No partial output, only write to out if the whole UTF-8 sequence is - * going to fit. - * The is always at least 4 bytes available (out.size() - out_offset) when + * No partial code point output, only write to out if the whole UTF-8 + * sequence for the code point is going to fit. + * There will always be at least 4 bytes available (out.size() - out_offset) when * called. - * Advance in_offset for data consumed. + * Advance in_offset for data consumed. 
Do NOT read past in.size(). * Advance out_offset for code points written. Do NOT write past out.size(). - * Do NOT resize out. * If at least one code point is decoded and written to out, return GOOD. * If it is not possible to decode a single code point, in_offset and * out_offset should not be advanced and something other than GOOD returned. * Do not keep any references to any of the parameters after returning, next - * advance() call will point to the following bytes, but all parameters + * decode() call will point to the following bytes, but all parameters * may have changed as they are subject to the buffer implementations of the * Processor. */ - virtual State decode(std::string_view in, std::size_t& in_offset, - std::string& out, std::size_t& out_offset) = 0; + virtual State decode(std::span<uint8_t const> in, std::size_t& in_offset, + std::span<uint8_t> out, std::size_t& out_offset) = 0; protected: Decoder() = default; diff --git a/sax/inc/sax_decoder_factory.hh b/sax/inc/sax_decoder_factory.hh index 80f1af3..2361ac3 100644 --- a/sax/inc/sax_decoder_factory.hh +++ b/sax/inc/sax_decoder_factory.hh @@ -2,7 +2,7 @@ #define SAX_DECODER_FACTORY_HH #include <memory> -#include <string> +#include <string_view> namespace modxml { namespace sax { @@ -23,7 +23,7 @@ class DecoderFactory { * Note that encoding value isn't cleaned up or validated in any way, it is * reported EXACTLY as found (even if not valid per XML spec). */ - virtual std::unique_ptr<Decoder> create(std::string const& encoding) = 0; + virtual std::unique_ptr<Decoder> create(std::string_view encoding) = 0; protected: DecoderFactory() = default; diff --git a/sax/inc/sax_delegate.hh b/sax/inc/sax_delegate.hh index ba63e72..59af2b7 100644 --- a/sax/inc/sax_delegate.hh +++ b/sax/inc/sax_delegate.hh @@ -1,9 +1,14 @@ #ifndef MODXML_SAX_DELEGATE_HH #define MODXML_SAX_DELEGATE_HH +#include <cstdint> +#include <string_view> + namespace modxml { namespace sax { +class Attributes; + /** * Delegate for processor. 
* Implement to handle events. @@ -12,6 +17,23 @@ class Delegate { public: virtual ~Delegate() = default; + virtual void start_element(std::string_view name, + Attributes const& attributes); + + virtual void end_element(std::string_view name); + + virtual void empty_element(std::string_view name, + Attributes const& attributes); + + virtual void character_data(std::string_view data); + + virtual void processing_instruction(std::string_view target, + std::string_view data); + + virtual void comment(std::string_view data); + + virtual void error(std::string_view message); + protected: Delegate() = default; }; diff --git a/sax/inc/sax_processor.hh b/sax/inc/sax_processor.hh index 7ca32f7..cf53807 100644 --- a/sax/inc/sax_processor.hh +++ b/sax/inc/sax_processor.hh @@ -2,6 +2,7 @@ #define MODXML_SAX_PROCESSOR_HH #include <memory> +#include <span> namespace modxml { namespace sax { @@ -23,6 +24,23 @@ class Processor { */ static std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate); + /** + * Process (consume) input data. + * Returns bytes consumed, can be zero. + */ + virtual std::size_t process(std::span<uint8_t const> data, + std::size_t offset = 0) = 0; + + /** + * When called from delegate, points to the start of the element that + * triggered the callback. + * When called otherwise, points to the last element that was processed. + * Lines start at 1. + * Columns start at 0. + */ + virtual uint64_t line() const = 0; + virtual uint64_t column() const = 0; + protected: Processor() = default; diff --git a/sax/inc/sax_processor_builder.hh b/sax/inc/sax_processor_builder.hh index 070fbbf..8b114e4 100644 --- a/sax/inc/sax_processor_builder.hh +++ b/sax/inc/sax_processor_builder.hh @@ -48,7 +48,7 @@ class ProcessorBuilder { * If you give a too small buffer size (such as zero) it will be ignored * and a implementation specific minimum will be used instead. * This is meant as a possible optimization and can be completely ignored. 
- * Note that the processor will allocate more data if it needed. + * Note that the processor will allocate more data if it needs to. */ virtual ProcessorBuilder* set_default_buffer_size(std::size_t size) = 0; diff --git a/sax/meson.build b/sax/meson.build index ccbdef4..8797c41 100644 --- a/sax/meson.build +++ b/sax/meson.build @@ -6,7 +6,11 @@ deps = [ inc = include_directories('inc') lib = shared_library( 'modxmlsax', + 'src/buffer.cc', 'src/decoder.cc', + 'src/guessing_decoder.cc', + 'src/sax_attributes.cc', + 'src/sax_delegate.cc', 'src/sax_processor.cc', 'src/sax_processor_builder.cc', 'src/utils.cc', @@ -20,3 +24,19 @@ sax_dep = declare_dependency( include_directories: inc, link_with: lib, ) + +test('buffer', + executable( + 'test_buffer', + sources: [ + 'src/buffer.cc', + 'tst/test_buffer.cc', + ], + include_directories: 'src', + dependencies: [base_dep, gmock_dep, gtest_dep])) + +test('decoder', + executable( + 'test_decoder', + sources: ['tst/test_decoder.cc'], + dependencies: [sax_dep, gtest_dep])) diff --git a/sax/src/buffer.cc b/sax/src/buffer.cc new file mode 100644 index 0000000..964865d --- /dev/null +++ b/sax/src/buffer.cc @@ -0,0 +1,398 @@ +#include "buffer.hh" + +#include <algorithm> +#include <cassert> +#include <memory> +#include <limits> + +namespace modxml { +namespace sax { + +namespace { + +class DynamicBuffer : public Buffer { + public: + DynamicBuffer(std::size_t default_size, std::size_t max_size) + : default_size_(std::min(default_size, max_size)), max_size_(max_size), + data_(std::make_unique_for_overwrite<uint8_t[]>(default_size_)), + size_(default_size_) {} + + std::span<uint8_t> wspan(std::size_t need) override { + auto avail = size_ - (offset_ + fill_); + if (need > avail) { + if (max_size_ - fill_ < need) // Early exit if need is never possible + return {}; + if (offset_ > 0) { + std::copy_n(data_.get() + offset_, fill_, data_.get()); + offset_ = 0; + } + avail = size_ - fill_; + if (need > avail) { + auto const max = 
std::numeric_limits<std::size_t>::max() / 2; + // Start growing from at least 1: when size_ is 0 (default_size == 0) + // doubling would never make progress and the loop would not terminate. + std::size_t new_size = std::max<std::size_t>(size_, 1); + while (true) { + if (new_size <= max) { + new_size *= 2; + } else { + new_size = std::numeric_limits<std::size_t>::max(); + } + if (new_size >= max_size_) { + new_size = max_size_; + break; + } + if (new_size - fill_ >= need) + break; + } + // Using new as it has std::nothrow which make_unique lacks. + // Easy enough to keep track of the pointers here anyway. + auto* tmp = new(std::nothrow) uint8_t[new_size]; + if (tmp == nullptr) + return {}; + std::copy_n(data_.get(), fill_, tmp); + size_ = new_size; + data_.reset(tmp); + } + } + return {data_.get() + offset_ + fill_, size_ - (offset_ + fill_)}; + } + + void commit(std::size_t size) override { + assert(size_ - (offset_ + fill_) >= size); + fill_ += size; + } + + std::span<uint8_t const> rspan(std::size_t) override { + return {data_.get() + offset_, fill_}; + } + + void consume(std::size_t size) override { + if (size == 0) + return; + assert(fill_ >= size); + fill_ -= size; + if (fill_ == 0) { + reset(); + } else { + offset_ += size; + } + } + + std::span<uint8_t> mspan(std::size_t) override { + return {data_.get() + offset_, fill_}; + } + + std::size_t uncommit(std::size_t size) override { + auto ret = std::min(size, fill_); + fill_ -= ret; + if (fill_ == 0) { + reset(); + } + return ret; + } + + bool empty() const override { + return fill_ == 0; + } + + bool full() const override { + return fill_ >= max_size_; + } + + void reset() override { + if (size_ != default_size_) + data_ = std::make_unique_for_overwrite<uint8_t[]>(size_ = default_size_); + offset_ = 0; + fill_ = 0; + } + + private: + std::size_t const default_size_; + std::size_t const max_size_; + std::unique_ptr<uint8_t[]> data_; + std::size_t size_; + std::size_t offset_{0}; + std::size_t fill_{0}; +}; + +class FixedBuffer : public Buffer { + public: + explicit FixedBuffer(std::size_t size) + : size_(size), data_(std::make_unique<uint8_t[]>(size_)) {} + + 
std::span<uint8_t> wspan(std::size_t need) override { + auto avail = wavail(); + if (need > avail) { + if (need > size_ - ravail()) // Early exit if need will never fit + return {}; + if (rptr_ < wptr_ || (rptr_ == wptr_ && !full_)) { + rotate(); + avail = wavail(); + } else { + return {}; + } + } + return {data_.get() + wptr_, avail}; + } + + void commit(std::size_t size) override { + if (size == 0) + return; + assert(wavail() >= size); + wptr_ += size; + if (wptr_ == size_) + wptr_ = 0; + if (rptr_ == wptr_) + full_ = true; + } + + std::span<uint8_t const> rspan(std::size_t want) override { + return mspan(want); + } + + void consume(std::size_t size) override { + if (size == 0) + return; + assert(ravail() >= size); + full_ = false; + rptr_ += size; + if (rptr_ == size_) + rptr_ = 0; + if (rptr_ == wptr_) + reset(); + } + + std::span<uint8_t> mspan(std::size_t want) override { + auto avail = ravail(); + if (want > avail) { + if (rptr_ > wptr_ || (rptr_ == wptr_ && full_)) { + rotate(); + avail = ravail(); + } + } + return {data_.get() + rptr_, avail}; + } + + std::size_t uncommit(std::size_t size) override { + if (size == 0) + return 0; + auto ret = do_uncommit(size); + if (ret < size) { + ret += do_uncommit(size - ret); + } + return ret; + } + + bool empty() const override { + return rptr_ == wptr_ && !full_; + } + + bool full() const override { + return rptr_ == wptr_ && full_; + } + + void reset() override { + rptr_ = 0; + wptr_ = 0; + full_ = false; + } + + private: + std::size_t ravail() const { + if (rptr_ < wptr_) + return wptr_ - rptr_; + if (rptr_ == wptr_ && !full_) + return 0; + return size_ - rptr_; + } + + std::size_t wavail() const { + if (rptr_ > wptr_) + return rptr_ - wptr_; + if (rptr_ == wptr_ && full_) + return 0; + return size_ - wptr_; + } + + std::size_t do_uncommit(std::size_t size) { + if (size == 0 || (rptr_ == wptr_ && !full_)) + return 0; + + full_ = false; + + if (wptr_ == 0) + wptr_ = size_; + + auto avail = rptr_ < wptr_ ? 
wptr_ - rptr_ : wptr_; + avail = std::min(avail, size); + wptr_ -= avail; + return avail; + } + + // Move the stored bytes so that they are contiguous and start at index 0. + void rotate() { + assert(rptr_ > 0); + + if (rptr_ < wptr_) { + std::copy(data_.get() + rptr_, data_.get() + wptr_, data_.get()); + wptr_ -= rptr_; + rptr_ = 0; + } else if (wptr_ < rptr_ || (wptr_ == rptr_ && full_)) { + auto left = wptr_; + auto right = size_ - rptr_; + // TODO: Can we do this without allocations? + if (left <= right) { + auto tmp = std::make_unique<uint8_t[]>(left); + std::copy_n(data_.get(), left, tmp.get()); + std::copy_n(data_.get() + rptr_, right, data_.get()); + std::copy_n(tmp.get(), left, data_.get() + right); + } else { + auto tmp = std::make_unique<uint8_t[]>(right); + std::copy_n(data_.get() + rptr_, right, tmp.get()); + // Third argument is the END of the destination range, so the left + // part lands in [right, left + right) and leaves [0, right) free. + std::copy_backward(data_.get(), data_.get() + left, + data_.get() + left + right); + std::copy_n(tmp.get(), right, data_.get()); + } + wptr_ = left + right; + if (wptr_ == size_) + wptr_ = 0; + rptr_ = 0; + } else { + assert(false); + } + } + + std::size_t const size_; + std::unique_ptr<uint8_t[]> data_; + std::size_t rptr_{0}; + std::size_t wptr_{0}; + bool full_{false}; +}; + +class ReadViewBufferImpl : public ReadViewBuffer { + public: + explicit ReadViewBufferImpl(std::unique_ptr<Buffer> buffer) + : buffer_(std::move(buffer)) {} + + std::size_t consumed() const override { + return offset_; + } + + std::unique_ptr<Buffer> release() override { + return std::move(buffer_); + } + + std::span<uint8_t> wspan(std::size_t need) override { + return buffer_->wspan(need); + } + + void commit(std::size_t size) override { + return buffer_->commit(size); + } + + std::span<uint8_t const> rspan(std::size_t want) override { + auto ret = buffer_->rspan(offset_ + want); + if (ret.size() <= offset_) + return ret.subspan(0, 0); + return ret.subspan(offset_, ret.size() - offset_); + } + + void consume(std::size_t size) override { + offset_ += size; + } + + std::span<uint8_t> mspan(std::size_t want) override { + auto ret = 
buffer_->mspan(offset_ + want); + if (ret.size() <= offset_) + return ret.subspan(0, 0); + return ret.subspan(offset_, ret.size() - offset_); + } + + std::size_t uncommit(std::size_t size) override { + return buffer_->uncommit(size); + } + + bool empty() const override { + if (buffer_->empty()) + return true; + auto data = buffer_->rspan(offset_ + 1); + return data.size() <= offset_; + } + + bool full() const override { + return buffer_->full(); + } + + void reset() override { + offset_ = 0; + } + + private: + std::unique_ptr<Buffer> buffer_; + std::size_t offset_{0}; +}; + +} // namespace + +std::unique_ptr<Buffer> make_buffer(std::size_t default_size, + std::size_t max_size) { + if (default_size >= max_size) + return std::make_unique<FixedBuffer>(max_size); + + return std::make_unique<DynamicBuffer>(default_size, max_size); +} + +std::unique_ptr<ReadViewBuffer> make_read_view_buffer( + std::unique_ptr<Buffer> buffer) { + return std::make_unique<ReadViewBufferImpl>(std::move(buffer)); +} + +std::size_t Buffer::write(std::span<uint8_t const> data) { + std::size_t offset = 0; + while (offset < data.size()) { + auto target = wspan(); + if (target.empty()) + break; + auto size = std::min(data.size() - offset, target.size()); + std::copy_n(data.data() + offset, size, target.data()); + commit(size); + offset += size; + } + return offset; +} + +bool Buffer::write_all(std::span<uint8_t const> data) { + if (data.empty()) + return true; + auto target = wspan(data.size()); + if (target.empty()) + return false; + std::copy(data.begin(), data.end(), target.begin()); + commit(data.size()); + return true; +} + +std::size_t Buffer::read(std::span<uint8_t> data) { + std::size_t offset = 0; + while (offset < data.size()) { + auto source = rspan(); + if (source.empty()) + break; + auto size = std::min(data.size() - offset, source.size()); + std::copy_n(source.data(), size, data.data() + offset); + consume(size); + offset += size; + } + return offset; +} + +bool 
Buffer::read_all(std::span<uint8_t> data) { + auto source = rspan(data.size()); + if (source.size() < data.size()) + return false; + std::copy_n(source.begin(), data.size(), data.begin()); + consume(data.size()); + return true; +} + +} // namespace sax +} // namespace modxml + diff --git a/sax/src/buffer.hh b/sax/src/buffer.hh new file mode 100644 index 0000000..d9fb9fc --- /dev/null +++ b/sax/src/buffer.hh @@ -0,0 +1,108 @@ +#ifndef BUFFER_HH +#define BUFFER_HH + +#include "macros.hh" + +#include <memory> +#include <span> + +namespace modxml { +namespace sax { + +class HIDDEN Buffer { + public: + virtual ~Buffer() = default; + + Buffer(Buffer const&) = delete; + Buffer& operator=(Buffer const&) = delete; + + // Returns a writable span, either at least need large or in case + // the buffer is full, an empty span. + // Returned span is valid until any other method is called on the buffer. + virtual std::span<uint8_t> wspan(std::size_t need = 1) = 0; + // Commit size data from the last returned wspan. size must be <= span.size. + // Remember that the span is now invalid and you need to call wspan again + // to write more. + virtual void commit(std::size_t size) = 0; + + // Returns a readable span of all readily available data in buffer. + // If there is enough data in the buffer to satisfy want, the returned + // span is at least as large. + // Returned span is valid until any other method is called on the buffer. + virtual std::span<uint8_t const> rspan(std::size_t want = 1) = 0; + // Consume size data from buffer. size must be <= span.size. + // Remember that the span is now invalid and you need to call rspan again + // to read more. + virtual void consume(std::size_t size) = 0; + + // Returns the same span as rspan but this is writable, you can modify + // the content. You cannot change the size of the span. + // If you wish to append data, use wspan() + commit(). + // If you wish to remove data, use uncommit(). + // If you wish to insert you have to be clever. 
+ // Returned span is valid until any other method is called on the buffer. + virtual std::span<uint8_t> mspan(std::size_t want = 1) = 0; + + // Uncommit the last size bytes in the buffer. Returns the number of + // bytes removed. If you used wspan() + commit() to add ten (10) bytes say and then + // call uncommit() with a size of seven (7) the first three (3) bytes written + // will be left in the buffer. + virtual std::size_t uncommit(std::size_t size) = 0; + + // Returns true if buffer is empty. + virtual bool empty() const = 0; + + // Returns true if buffer is full. This means filled to max_size. + virtual bool full() const = 0; + + // Clear buffer, reset back to initial state. + virtual void reset() = 0; + + // Write as much as possible of data to buffer. + // Returns bytes written (may be zero). + std::size_t write(std::span<uint8_t const> data); + + // Either write all of the data to buffer or none. Returns true if data was + // written or data was empty. + bool write_all(std::span<uint8_t const> data); + + // Read as much as possible from buffer to data. + // Returns bytes read (may be zero). + std::size_t read(std::span<uint8_t> data); + + // Either fill data with data from buffer or return false. + bool read_all(std::span<uint8_t> data); + + protected: + Buffer() = default; +}; + +// Create a buffer. default_size is used as a hint but generally that +// will be the initial size of the buffer. max_size is a hard limit. +// max_size == 0 is valid but will return an always full and empty buffer. +std::unique_ptr<Buffer> HIDDEN make_buffer(std::size_t default_size, + std::size_t max_size); + +class ReadViewBuffer : public Buffer { + public: + // Returns bytes consumed in this buffer. + virtual std::size_t consumed() const = 0; + + // Take ownership back of the wrapped buffer from the read view. + // The read view is now unusable. + virtual std::unique_ptr<Buffer> release() = 0; + + protected: + ReadViewBuffer() = default; +}; + +// Create a read view buffer. 
Writing will go to wrapped buffer. Reading +// is done on the read view buffer without moving the wrapped buffers read +// pointer. These views are lightweight. +std::unique_ptr<ReadViewBuffer> HIDDEN make_read_view_buffer( + std::unique_ptr<Buffer> buffer); + +} // namespace sax +} // namespace modxml + +#endif // BUFFER_HH diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc index 30b1735..35b9b46 100644 --- a/sax/src/decoder.cc +++ b/sax/src/decoder.cc @@ -12,273 +12,233 @@ namespace sax { namespace { -class UtfDecoder : public Decoder { +class KnownEndianDecoder : public Decoder { public: - State decode(std::string_view in, std::size_t& in_offset, - uint32_t* out, std::size_t out_size, - std::size_t& out_offset) override { - std::size_t const out_start = out_offset; + State decode(std::span<uint8_t const> in, std::size_t& in_offset, + std::span<uint8_t> out, std::size_t& out_offset) override { + std::size_t tmp = in_offset; + uint32_t ret = read(in, tmp); + if (ret == utf::NEED_MORE) + return State::NEED_MORE; + if (ret == utf::INVALID) + return State::INVALID; + if (bom_ == -1) UNLIKELY { - std::size_t tmp = in_offset; - uint32_t ret = read(in, tmp); - if (ret == utf::NEED_MORE) { - return State::NEED_MORE; - } - if (ret == utf::INVALID) { - return State::INVALID; - } if (ret == 0xfeff) { // To allow offset to advance and to return, we need to // read at least one more character completely. 
ret = read(in, tmp); - if (ret == utf::NEED_MORE) { + if (ret == utf::NEED_MORE) return State::NEED_MORE; - } - if (ret == utf::INVALID) { + if (ret == utf::INVALID) return State::INVALID; - } bom_ = 1; } else { bom_ = 0; } - in_offset = tmp; - out[out_offset++] = ret; - if (out_offset == out_size) - return State::GOOD; + if (!utf::write8(ret, out, out_offset)) { + bom_ = -1; + return State::NEED_MORE; + } + } else { + if (!utf::write8(ret, out, out_offset)) + return State::NEED_MORE; } + in_offset = tmp; - do { - uint32_t ret = read(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); - return State::GOOD; + while (true) { + ret = read(in, tmp); + if (ret == utf::NEED_MORE || ret == utf::INVALID) + return State::GOOD; + if (!utf::write8(ret, out, out_offset)) + return State::GOOD; + in_offset = tmp; + } } protected: - UtfDecoder() = default; + KnownEndianDecoder() = default; - virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0; + virtual uint32_t read( + std::span<uint8_t const> data, std::size_t& offset) const = 0; private: int8_t bom_{-1}; }; -class Utf8Decoder : public UtfDecoder { +class Utf8Decoder : public KnownEndianDecoder { public: Utf8Decoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span<uint8_t const> data, std::size_t& offset) const override { return utf::read8(data, offset); } }; -class Utf16BeDecoder : public UtfDecoder { +class Utf16BeDecoder : public KnownEndianDecoder { public: Utf16BeDecoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span<uint8_t const> data, std::size_t& offset) const override { return utf::read16be(data, offset); } }; -class Utf16LeDecoder : 
public UtfDecoder { +class Utf16LeDecoder : public KnownEndianDecoder { public: Utf16LeDecoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span<uint8_t const> data, std::size_t& offset) const override { return utf::read16le(data, offset); } }; -class Utf32BeDecoder : public UtfDecoder { +class Utf32BeDecoder : public KnownEndianDecoder { public: Utf32BeDecoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span<uint8_t const> data, std::size_t& offset) const override { return utf::read32be(data, offset); } }; -class Utf32LeDecoder : public UtfDecoder { +class Utf32LeDecoder : public KnownEndianDecoder { public: Utf32LeDecoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span<uint8_t const> data, std::size_t& offset) const override { return utf::read32le(data, offset); } }; -class Utf16Decoder : public Decoder { +class UnknownEndianDecoder : public Decoder { public: - Utf16Decoder() = default; - - State decode(std::string_view in, std::size_t& in_offset, - uint32_t* out, std::size_t out_size, - std::size_t& out_offset) override { - std::size_t const out_start = out_offset; + State decode(std::span<uint8_t const> in, std::size_t& in_offset, + std::span<uint8_t> out, std::size_t& out_offset) override { + std::size_t tmp = in_offset; if (endian_ == -1) UNLIKELY { - std::size_t tmp = in_offset; - uint32_t ret = utf::read16be(in, tmp); - int8_t endian; - if (ret == utf::NEED_MORE) { + uint32_t ret = readbe(in, tmp); + if (ret == utf::NEED_MORE) return State::NEED_MORE; - } - if (ret == utf::INVALID) { + if (ret == utf::INVALID) return State::INVALID; - } if (ret == 0xfeff) { - endian = 1; // Big endian + endian_ = 1; } else if (ret == 0xfffe) { - endian = 0; // Little endian + endian_ = 0; } else { return State::INVALID; } + in_offset = tmp; + } - // To allow offset to 
advance and to return, we need to - // read at least one more character completely. - ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp); - if (ret == utf::NEED_MORE) { + if (endian_ == 0) { + uint32_t ret = readle(in, tmp); + if (ret == utf::NEED_MORE) return State::NEED_MORE; - } - if (ret == utf::INVALID) { + if (ret == utf::INVALID) return State::INVALID; - } + if (!utf::write8(ret, out, out_offset)) + return State::NEED_MORE; + in_offset = tmp; - endian_ = endian; + while (true) { + ret = readle(in, tmp); + if (ret == utf::NEED_MORE || ret == utf::INVALID) + return State::GOOD; + if (!utf::write8(ret, out, out_offset)) + return State::GOOD; + in_offset = tmp; + } + } else /* if (endian_ == 1) */ { + uint32_t ret = readbe(in, tmp); + if (ret == utf::NEED_MORE) + return State::NEED_MORE; + if (ret == utf::INVALID) + return State::INVALID; + if (!utf::write8(ret, out, out_offset)) + return State::NEED_MORE; in_offset = tmp; - out[out_offset++] = ret; - if (out_offset == out_size) - return State::GOOD; - } - if (endian_ == 1) { - do { - uint32_t ret = utf::read16be(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); - } else { - do { - uint32_t ret = utf::read16le(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? 
State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); + while (true) { + ret = readbe(in, tmp); + if (ret == utf::NEED_MORE || ret == utf::INVALID) + return State::GOOD; + if (!utf::write8(ret, out, out_offset)) + return State::GOOD; + in_offset = tmp; + } } - return State::GOOD; } + protected: + UnknownEndianDecoder() = default; + + virtual uint32_t readle( + std::span<uint8_t const> data, std::size_t& offset) const = 0; + virtual uint32_t readbe( + std::span<uint8_t const> data, std::size_t& offset) const = 0; + private: int8_t endian_{-1}; }; -class Utf32Decoder : public Decoder { +class Utf16Decoder : public UnknownEndianDecoder { public: - Utf32Decoder() = default; + Utf16Decoder() = default; - State decode(std::string_view in, std::size_t& in_offset, - uint32_t* out, std::size_t out_size, - std::size_t& out_offset) override { - std::size_t const out_start = out_offset; - if (endian_ == -1) UNLIKELY { - std::size_t tmp = in_offset; - uint32_t ret = utf::read32be(in, tmp); - int8_t endian; - if (ret == utf::NEED_MORE) { - return State::NEED_MORE; - } - if (ret == utf::INVALID) { - tmp = in_offset; - ret = utf::read32le(in, tmp); - if (ret == 0xfeff) { - endian = 0; // Little endian - } else { - return State::INVALID; - } - } else if (ret == 0xfeff) { - endian = 1; // Big endian - } else { - return State::INVALID; - } + uint32_t readle( + std::span<uint8_t const> data, std::size_t& offset) const override { + return utf::read16le(data, offset); + } - // To allow offset to advance and to return, we need to - // read the next character completely. - ret = endian == 1 ? 
utf::read32be(in, tmp) : utf::read32le(in, tmp); - if (ret == utf::NEED_MORE) { - return State::NEED_MORE; - } - if (ret == utf::INVALID) { - return State::INVALID; - } + uint32_t readbe( + std::span<uint8_t const> data, std::size_t& offset) const override { + return utf::read16be(data, offset); + } +}; - endian_ = endian; - in_offset = tmp; - out[out_offset++] = ret; - if (out_offset == out_size) - return State::GOOD; - } +class Utf32Decoder : public UnknownEndianDecoder { + public: + Utf32Decoder() = default; - if (endian_ == 1) { - do { - uint32_t ret = utf::read32be(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); - } else { - do { - uint32_t ret = utf::read32le(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); - } - return State::GOOD; + uint32_t readle( + std::span<uint8_t const> data, std::size_t& offset) const override { + return utf::read32le(data, offset); } - private: - int8_t endian_{-1}; + uint32_t readbe( + std::span<uint8_t const> data, std::size_t& offset) const override { + return utf::read32be(data, offset); + } }; class AsciiDecoder : public Decoder { public: AsciiDecoder() = default; - State decode(std::string_view in, std::size_t& in_offset, - uint32_t* out, std::size_t out_size, - std::size_t& out_offset) override { - std::size_t const out_start = out_offset; - do { - if (in_offset == in.size()) - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - if (in[in_offset] & 0x80) - return out_offset > out_start ? 
State::GOOD : State::INVALID;
-      out[out_offset++] = in[in_offset++];
-    } while (out_offset < out_size);
-    return State::GOOD;
+  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+               std::span<uint8_t> out, std::size_t& out_offset) override {
+    if (in_offset >= in.size())
+      return State::NEED_MORE;
+    if (in[in_offset] & 0x80)
+      return State::INVALID;
+    if (!utf::write8(in[in_offset], out, out_offset))
+      return State::NEED_MORE;
+    ++in_offset;
+
+    while (true) {
+      if (in_offset >= in.size() || in[in_offset] & 0x80)
+        return State::GOOD;
+      if (!utf::write8(in[in_offset], out, out_offset))
+        return State::GOOD;
+      ++in_offset;
+    }
   }
 };
diff --git a/sax/src/guessing_decoder.cc b/sax/src/guessing_decoder.cc
new file mode 100644
index 0000000..e72dab3
--- /dev/null
+++ b/sax/src/guessing_decoder.cc
@@ -0,0 +1,100 @@
+#include "guessing_decoder.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+#include <cassert>
+
+using namespace std::string_view_literals;
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Returns true if the bytes of `a` starting at `a_offset` begin with `b`.
+// Requires a_offset <= a.size().
+bool eq(std::span<uint8_t const> a, std::size_t a_offset, std::string_view b) {
+  if (a.size() - a_offset < b.size())
+    return false;
+  // char may be signed; compare as bytes so values >= 0x80 can match.
+  for (size_t i = 0; i < b.size(); ++i)
+    if (a[a_offset + i] != static_cast<uint8_t>(b[i]))
+      return false;
+  return true;
+}
+
+// Decoder that inspects the first bytes of the input (BOM, or the byte
+// pattern of a leading ASCII character such as '<') to pick a concrete
+// decoder, then forwards every decode() call to it.
+class GuessingDecoder : public Decoder {
+ public:
+  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+               std::span<uint8_t> out, std::size_t& out_offset) override {
+    assert(in_offset <= in.size());
+
+    if (!decided_) {
+      if (eq(in, in_offset, "\xef\xbb\xbf"sv)) {
+        decided_ = create_utf8_decoder();
+      } else if (eq(in, in_offset, "\x00\x00\xfe\xff"sv)) {
+        in_offset += 4;
+        decided_ = create_utf32be_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe\x00\x00"sv)) {
+        in_offset += 4;
+        decided_ = create_utf32le_decoder();
+      } else if (eq(in, in_offset, "\xfe\xff"sv)) {
+        in_offset += 2;
+        decided_ = create_utf16be_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe"sv)) {
+        // Could be the start of a UTF-32LE BOM, need more data to decide
+        // (note, an xml document encoded in UTF-16 that is less than 4 bytes
+        // is rather impossible).
+        if (in.size() - in_offset < 4)
+          return State::NEED_MORE;
+        in_offset += 2;
+        decided_ = create_utf16le_decoder();
+      } else {
+        auto avail = in.size() - in_offset;
+        if (avail == 0)
+          return State::NEED_MORE;
+        // No BOM; guess from where the zero bytes of a leading ASCII
+        // character (normally '<') are placed.
+        if (avail >= 4 && in[in_offset] == 0 && in[in_offset + 1] == 0
+            && in[in_offset + 2] == 0 && in[in_offset + 3] != 0) {
+          decided_ = create_utf32be_decoder();  // 00 00 00 xx
+        } else if (avail >= 4 && in[in_offset] != 0 && in[in_offset + 1] == 0
+                   && in[in_offset + 2] == 0 && in[in_offset + 3] == 0) {
+          decided_ = create_utf32le_decoder();  // xx 00 00 00
+        } else if (avail >= 2 && in[in_offset] == 0 && in[in_offset + 1] != 0) {
+          decided_ = create_utf16be_decoder();  // 00 xx
+        } else if (avail >= 2 && in[in_offset] != 0 && in[in_offset + 1] == 0) {
+          decided_ = create_utf16le_decoder();  // xx 00
+        } else {
+          auto tmp = in_offset;
+          auto ret = utf::read8(in, tmp);
+          if (ret == utf::NEED_MORE)
+            return State::NEED_MORE;
+          if (ret == utf::INVALID)
+            return State::INVALID;
+          // UTF-8 should be good enough to read the XML declaration.
+          decided_ = create_utf8_decoder();
+        }
+      }
+    }
+    return decided_->decode(in, in_offset, out, out_offset);
+  }
+
+ private:
+  std::unique_ptr<Decoder> decided_;
+};
+
+} // namespace
+
+std::unique_ptr<Decoder> create_guessing_decoder() {
+  return std::make_unique<GuessingDecoder>();
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/guessing_decoder.hh b/sax/src/guessing_decoder.hh
new file mode 100644
index 0000000..0f42c3b
--- /dev/null
+++ b/sax/src/guessing_decoder.hh
@@ -0,0 +1,21 @@
+#ifndef GUESSING_DECODER_HH
+#define GUESSING_DECODER_HH
+
+#include "macros.hh"
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+// Decoder that tries to figure out, using BOM or just magic
+// what encoding is used, optimized for the first character to be
+// '<'.
+std::unique_ptr<Decoder> HIDDEN create_guessing_decoder();
+
+} // namespace sax
+} // namespace modxml
+
+#endif // GUESSING_DECODER_HH
diff --git a/sax/src/sax_attributes.cc b/sax/src/sax_attributes.cc
new file mode 100644
index 0000000..230c677
--- /dev/null
+++ b/sax/src/sax_attributes.cc
@@ -0,0 +1,38 @@
+#include "sax_attributes.hh"
+
+namespace modxml {
+namespace sax {
+
+Attribute::Attribute(std::string_view name, std::string_view value)
+  : name(name), value(value) {}
+
+std::optional<std::string_view> Attributes::find_first(std::string_view name)
+  const {
+  for (auto it = begin(); it != end(); ++it) {
+    if (it->name == name)
+      return it->value;
+  }
+  return std::nullopt;
+}
+
+std::optional<std::string_view> Attributes::find_last(std::string_view name)
+  const {
+  for (size_t i = size(); i > 0; --i) {
+    auto const& a = at(i - 1);
+    if (a.name == name)
+      return a.value;
+  }
+  return std::nullopt;
+}
+
+std::optional<std::size_t> Attributes::find(std::string_view name,
+                                            std::size_t index) const {
+  for (; index < size(); ++index) {
+    if (at(index).name == name)
+      return index;
+  }
+  return std::nullopt;
+}
+
+} // namespace sax
+} // namespace modxml
diff
--git a/sax/src/sax_delegate.cc b/sax/src/sax_delegate.cc new file mode 100644 index 0000000..2c2cfcd --- /dev/null +++ b/sax/src/sax_delegate.cc @@ -0,0 +1,21 @@ +#include "sax_delegate.hh" + +namespace modxml { +namespace sax { + +void Delegate::start_element(std::string_view, Attributes const&) {} + +void Delegate::empty_element(std::string_view, Attributes const&) {} + +void Delegate::end_element(std::string_view) {} + +void Delegate::character_data(std::string_view) {} + +void Delegate::processing_instruction(std::string_view, std::string_view) {} + +void Delegate::comment(std::string_view) {} + +void Delegate::error(std::string_view) {} + +} // namespace sax +} // namespace modxml diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc index ea9f753..afc9d3b 100644 --- a/sax/src/sax_processor.cc +++ b/sax/src/sax_processor.cc @@ -1,18 +1,41 @@ #include "sax_processor.hh" -#include "sax_decoder.hh" +#include <iostream> + +#include "buffer.hh" +#include "guessing_decoder.hh" #include "processor.hh" +#include "sax_attributes.hh" +#include "sax_decoder.hh" +#include "sax_decoder_factory.hh" +#include "sax_delegate.hh" +#include "utf8.hh" +#include "utf_error.hh" #include "utils.hh" #include <algorithm> +#include <cassert> +#include <charconv> +#include <format> +#include <map> #include <optional> #include <utility> +#include <vector> + +using namespace std::string_view_literals; namespace modxml { namespace sax { namespace { +constexpr std::size_t kDefaultBufferSize = 8192; +constexpr std::size_t kMinBufferSize = 128; + +inline bool is_digit(char c) { + return c >= '0' && c <= '9'; +} + // 2.2 Characters // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] @@ -75,12 +98,185 @@ inline bool is_namechar(uint32_t c) { (c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040); } -/* [5] Name ::= NameStartChar (NameChar)* +/* +[5] Name ::= NameStartChar (NameChar)* [6] Names ::= Name (#x20 Name)* [7] Nmtoken ::= (NameChar)+ 
[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
 */
 
+// Maps ASCII 'A'-'Z' to 'a'-'z'; every other character is returned unchanged.
+inline char ascii_lowercase(char c) {
+  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
+}
+
+// ASCII case-insensitive comparison: true if `a`, lower-cased, equals `b`.
+// `b` is compared as-is, so it has to be lower-case already to match.
+bool eq_lowercase(std::string_view a, std::string_view b) {
+  if (a.size() != b.size())
+    return false;
+  for (std::size_t i = 0; i < a.size(); ++i)
+    if (ascii_lowercase(a[i]) != b[i])
+      return false;
+  return true;
+}
+
+// Views the bytes of `span` as characters; no copy is made, so the result
+// is only valid for as long as the memory behind `span` is.
+inline std::string_view make_string_view(std::span<uint8_t const> span) {
+  return std::string_view(reinterpret_cast<char const*>(span.data()),
+                          span.size());
+}
+
+// Resolves references: the five predefined entities (lt, gt, amp, apos,
+// quot) and numeric character references (#NNN decimal, #xHHH hex).
+class Entities {
+ public:
+  Entities() {
+    data_.emplace("lt", "<");
+    data_.emplace("gt", ">");
+    data_.emplace("amp", "&");
+    data_.emplace("apos", "'");
+    data_.emplace("quot", "\"");
+  }
+
+  // `entity` is the text between '&' and ';'. Returns the replacement text
+  // encoded as UTF-8, or std::nullopt if the reference is unknown or
+  // malformed.
+  std::optional<std::string> get(std::string const& entity) const {
+    if (entity.empty())
+      return std::nullopt;
+    if (entity.front() == '#') {
+      if (entity.size() == 1)
+        return std::nullopt;
+      int base;
+      char const* start;
+      char const* end = entity.data() + entity.size();
+      if (entity[1] == 'x') {
+        start = entity.data() + 2;
+        base = 16;
+      } else {
+        start = entity.data() + 1;
+        base = 10;
+      }
+      uint32_t value;
+      auto [ptr, ec] = std::from_chars(start, end, value, base);
+      // All of the digits must be consumed, "#12a" is not a reference.
+      if (ec != std::errc() || ptr != end)
+        return std::nullopt;
+      uint8_t tmp[4];
+      std::size_t offset = 0;
+      // write8 signals failure by returning false; do not turn that into
+      // an (empty) replacement.
+      if (!utf::write8(value, tmp, offset))
+        return std::nullopt;
+      return std::string(reinterpret_cast<char*>(tmp), offset);
+    }
+    auto it = data_.find(entity);
+    if (it == data_.end())
+      return std::nullopt;
+    return it->second;
+  }
+
+ private:
+  std::map<std::string, std::string> data_;
+};
+
+// Replaces every "&name;" reference in `str`, starting the search at index
+// `last`, with the text `entities` gives for it. Returns false if a
+// reference is unterminated or unknown; `str` may then be partly rewritten.
+bool deamp(Entities const& entities, std::string& str, std::size_t last = 0) {
+  while (true) {
+    auto const amp = str.find('&', last);
+    if (amp == std::string::npos)
+      break;
+    auto const name_start = amp + 1;
+    auto const semicolon = str.find(';', name_start);
+    if (semicolon == std::string::npos)
+      return false;
+    auto const replacement =
+        entities.get(str.substr(name_start, semicolon - name_start));
+    if (!replacement.has_value())
+      return false;
+    str.replace(amp, semicolon - amp + 1, *replacement);
+    // Continue after the inserted text so that it is never scanned again
+    // (the '&' produced by "&amp;" must not start a new reference).
+    last = amp + replacement->size();
+  }
+  return true;
+}
+
+std::optional<std::string> unquote(Entities const& entities, + std::string_view quoted) { + assert(quoted.size() >= 2); + assert(quoted.front() == quoted.back()); + std::string ret(quoted.substr(1, quoted.size() - 2)); + if (deamp(entities, ret)) + return ret; + return std::nullopt; +} + +std::optional<std::string_view> unquote_if_needed(Entities const& entities, + std::string_view quoted, + std::string& tmp) { + assert(quoted.size() >= 2); + assert(quoted.front() == quoted.back()); + auto input = quoted.substr(1, quoted.size() - 2); + auto index = input.find('&'); + if (index == std::string_view::npos) + return input; + tmp.assign(input); + if (deamp(entities, tmp, index)) + return tmp; + return std::nullopt; +} + +class AttributesImpl : public Attributes { + public: + AttributesImpl() = default; + + bool init(Entities const& entities, + std::span<const uint8_t> data, + std::vector<size_t> const& offsets, + std::size_t first) { + std::size_t a = first; + attr_.reserve((offsets.size() - first) / 4); + while (a + 4 <= offsets.size()) { + auto name = make_string_view(data.subspan(offsets[a], offsets[a + 1])); + std::string tmp; + auto value = unquote_if_needed( + entities, + make_string_view(data.subspan(offsets[a + 2], offsets[a + 3])), + tmp); + if (!value.has_value()) + return false; + if (tmp.empty()) { + attr_.emplace_back(name, *value); + } else { + attr_.emplace_back(name, *value, std::move(tmp)); + } + a += 4; + } + return true; + } + + iterator begin() const override { + return Iterator(this, 0); + } + + iterator end() const override { + return Iterator(this, attr_.size()); + } + + std::size_t size() const override { + return attr_.size(); + } + + Attribute const& at(std::size_t index) const override { + return attr_[index]; + } + + private: + class Iterator : public iterator { + public: + Iterator(Attributes const* attributes, std::size_t index) + : iterator(attributes, index) {} + }; + + struct AttributeImpl : public Attribute { + 
AttributeImpl(std::string_view name, std::string_view value) + : Attribute(name, value) {} + + AttributeImpl(std::string_view name, std::string_view value, + std::string&& tmp) + : Attribute(name, value), tmp_(std::move(tmp)) {} + + private: + std::string tmp_; + }; + + std::span<const uint8_t> data_; + std::vector<AttributeImpl> attr_; +}; + class ProcessorImpl : public Processor { public: ProcessorImpl(std::shared_ptr<Delegate> delegate, @@ -91,15 +287,898 @@ class ProcessorImpl : public Processor { : delegate_(std::move(delegate)), decoder_factory_(std::move(decoder_factory)), decoder_(std::move(decoder)), - default_buffer_size_(default_buffer_size), - max_buffer_size_(max_buffer_size) {} + forced_decoder_(decoder_), + buffer_(make_buffer(default_buffer_size, max_buffer_size)) { + if (!decoder_) + decoder_ = create_guessing_decoder(); + + expect_document(); + } + + std::size_t process(std::span<uint8_t const> data, + std::size_t offset) override { + cmds_.emplace_back(Command::FILL_BUFFER, Count::ZERO_OR_ONE); + + std::size_t consumed = 0; + + while (true) { + if (cmds_.empty()) { + if (!buffer_->empty()) { + std::cerr << make_string_view(buffer_->rspan()) << std::endl; + delegate_->error("Extra data at end"); + } + return consumed; + } + + auto current = cmds_.back(); + auto const old_size = cmds_.size(); + cmds_.pop_back(); + Process ret; + switch (current.command) { + case Command::FILL_BUFFER: + ret = fill_buffer(data, offset, consumed); + break; + case Command::MISC: + ret = process_misc(current); + break; + case Command::SPACE: + ret = process_space(current); + break; + case Command::ELEMENT: + ret = process_element(current); + break; + case Command::COMMENT: + ret = process_comment(current); + break; + case Command::PROCESSING_INSTRUCTION: + ret = process_processing_instruction(current); + break; + case Command::XMLDECL: + ret = process_xmldecl(current); + break; + case Command::ATTRIBUTE: + ret = process_attribute(current); + break; + case Command::NAME: 
+ ret = process_name(current); + break; + case Command::ATTRIBUTE_VALUE: + ret = process_attribute_value(current); + break; + case Command::EQUAL: + ret = process_equal(current); + break; + case Command::START_OR_EMPTY_TAG: + ret = process_start_or_empty_tag(current); + break; + case Command::END_TAG: + ret = process_end_tag(current); + break; + } + + switch (ret) { + case Process::NEED_MORE: + case Process::ERROR: + cmds_.push_back(current); + assert(cmds_.size() == old_size); + return consumed; + case Process::CONTINUE: + break; + } + } + } + + uint64_t line() const override { return line_; } + + uint64_t column() const override { return column_; } private: + enum class Process { + NEED_MORE, + ERROR, + CONTINUE, + }; + + enum class Match { + FULL_MATCH, + PARTIAL_MATCH, + NO_MATCH, + }; + + enum class Command { + FILL_BUFFER, + + ATTRIBUTE, + ATTRIBUTE_VALUE, + COMMENT, + ELEMENT, + END_TAG, + EQUAL, + MISC, + NAME, + PROCESSING_INSTRUCTION, + SPACE, + START_OR_EMPTY_TAG, + XMLDECL, + }; + + enum class Count { + ONE, + ONE_OR_MANY, + ZERO_OR_ONE, + ZERO_OR_MANY, + }; + + struct CommandItem { + Command const command; + Count const count; + std::size_t offset; + + CommandItem(Command command, Count count, std::size_t offset = 0) + : command(command), count(count), offset(offset) {} + }; + + struct StackItem { + std::vector<std::size_t> offsets; + }; + + Process fill_buffer(std::span<uint8_t const> data, + std::size_t offset, + std::size_t& consumed) { + if (offset >= data.size()) + return Process::NEED_MORE; + + std::size_t tmp = offset; + auto wspan = buffer_->wspan(4); + switch (decoder_->decode(data, tmp, wspan, consumed)) { + case Decoder::State::GOOD: + break; + case Decoder::State::NEED_MORE: + return Process::NEED_MORE; + case Decoder::State::INVALID: + delegate_->error("Invalid data"); + return Process::ERROR; + } + buffer_->commit(consumed); + return Process::CONTINUE; + } + + void expect_document() { + // document := prolog element Misc* + 
expect_misc(Count::ZERO_OR_MANY); + expect_element(Count::ONE); + expect_prolog(); + } + + void expect_misc(Count count) { + cmds_.emplace_back(Command::MISC, count); + } + + void expect_element(Count count) { + // element ::= EmptyElemTag | STag content ETag + cmds_.emplace_back(Command::START_OR_EMPTY_TAG, count); + } + + void expect_end_tag(Count count) { + cmds_.emplace_back(Command::END_TAG, count); + } + + void expect_prolog() { + // prolog := XMLDecl? Misc* (doctypedecl Misc*)? + expect_misc(Count::ZERO_OR_MANY); + expect_doctypedecl(Count::ZERO_OR_ONE); + expect_misc(Count::ZERO_OR_MANY); + expect_xmldecl(Count::ZERO_OR_ONE); + } + + void expect_xmldecl(Count count) { + cmds_.emplace_back(Command::XMLDECL, count); + } + + void expect_doctypedecl(Count) { + // TODO + } + + void expect_comment(Count count, std::size_t start_offset = 0) { + // Comment should never be more than one, should be MISC that is repeated. + assert(count == Count::ONE); + cmds_.emplace_back(Command::COMMENT, count, start_offset); + } + + void expect_content(Count) { + // TODO + } + + void expect_pi(Count count, std::size_t start_offset = 0) { + // PI should never be more than one, should be MISC that is repeated. + assert(count == Count::ONE); + cmds_.emplace_back(Command::PROCESSING_INSTRUCTION, count, start_offset); + } + + void expect_space(Count count) { + // There is not way to have SS as S is continous, so we should never + // ask for more than one or zero. 
+ assert(count == Count::ZERO_OR_ONE || count == Count::ONE); + cmds_.emplace_back(Command::SPACE, count); + } + + void expect_attribute(Count count) { + switch (count) { + case Count::ONE_OR_MANY: + cmds_.emplace_back(Command::ATTRIBUTE, Count::ZERO_OR_MANY); + case Count::ONE: + // Attribute ::= Name Eq AttValue + expect_attribute_value(Count::ONE); + expect_equal(Count::ONE); + expect_name(Count::ONE); + expect_space(Count::ONE); + break; + case Count::ZERO_OR_ONE: + case Count::ZERO_OR_MANY: + cmds_.emplace_back(Command::ATTRIBUTE, count); + break; + } + } + + void expect_attribute_value(Count count) { + cmds_.emplace_back(Command::ATTRIBUTE_VALUE, count); + } + + void expect_equal(Count count) { + // Eq ::= S? '=' S? + expect_space(Count::ZERO_OR_ONE); + cmds_.emplace_back(Command::EQUAL, count); + expect_space(Count::ZERO_OR_ONE); + } + + void expect_name(Count count) { + cmds_.emplace_back(Command::NAME, count); + } + + Process process_misc(CommandItem const& item) { + // Misc := Comment | PI | S + assert(item.offset == 0); + + switch (match("<!--")) { + case Match::FULL_MATCH: + add_if_more(item); + expect_comment(Count::ONE, 3); + return Process::CONTINUE; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + break; + } + + switch (match("<?")) { + case Match::FULL_MATCH: + add_if_more(item); + expect_pi(Count::ONE, 2); + return Process::CONTINUE; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + break; + } + + switch (match_s()) { + case Match::FULL_MATCH: + add_if_more(item); + expect_space(Count::ONE); + return Process::CONTINUE; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + break; + } + + return no_match(item); + } + + Process process_attribute(CommandItem& item) { + // This actually parses (S Attribute)* when followed by S? 
+ // for Attribute parsing see expect_attribute() + // So we need to figure out if the S means start of attribute + // or just an S. We do this by checking if the first non-S is + // a namestart or something else. We consume the S. + uint32_t last_char; + auto ret = consume_space(item.offset, last_char); + if (ret != Process::CONTINUE) + return ret; + + // No S, cannot be followed by an attribute then. + if (item.offset == 0) + return no_match(item); + + // First character after S isn't a valid first character of a name, + // cannot be followed by an attribute then. + if (!is_namestartchar(last_char)) + return no_match(item); + + expect_attribute_value(Count::ONE); + expect_equal(Count::ONE); + expect_name(Count::ONE); + return Process::CONTINUE; + } + + Process process_equal(CommandItem const& item) { + // Eq ::= S? '=' S? + // Spacing added by expect_equal + switch (match_consume("=")) { + case Match::FULL_MATCH: + add_if_more(item); + return Process::CONTINUE; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + return no_match(item); + } + } + + Process process_name(CommandItem& item) { + // Name ::= NameStartChar (NameChar)* + auto data = buffer_->rspan(item.offset + 4); + while (true) { + std::size_t tmp = item.offset; + auto c = utf::read8(data, tmp); + if (c == utf::NEED_MORE) + return Process::NEED_MORE; + if (c == utf::INVALID || !valid_char(c)) + return invalid_char(data, tmp); + if (item.offset == 0) { + if (!is_namestartchar(c)) + return no_match(item); + } else { + if (!is_namechar(c)) + break; + } + item.offset = tmp; + } + + assert(!stack_.empty()); + auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get()); + stack_.back().offsets.push_back(read_view->consumed()); + stack_.back().offsets.push_back(item.offset); + buffer_->consume(item.offset); + return Process::CONTINUE; + } + + Process process_attribute_value(CommandItem& item) { + // AttValue ::= '"' ([^<&"] | Reference)* '"' + // | "'" ([^<&'] | Reference)* 
"'" + + uint32_t end_char; + auto data = buffer_->rspan(item.offset + 4); + + if (item.offset == 0) { + std::size_t tmp = item.offset; + auto c = utf::read8(data, tmp); + if (c == utf::NEED_MORE) + return Process::NEED_MORE; + if (c == utf::INVALID || !valid_char(c)) + return invalid_char(data, tmp); + if (c != '"' && c != '\'') + return no_match(item); + item.offset = tmp; + end_char = c; + } else { + assert(!data.empty()); + end_char = data[0]; // ok as both " and ' are ASCII + } + + while (true) { + auto c = utf::read8(data, item.offset); + if (c == utf::NEED_MORE) + return Process::NEED_MORE; + if (c == utf::INVALID || !valid_char(c)) + return invalid_char(data, item.offset); + if (c == end_char) + break; + // TODO: Should we validate reference already here or do we let + // unquoute take care of that? As Reference can't contain end_char + // only checking for end_char is safe here. + } + + assert(!stack_.empty()); + auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get()); + stack_.back().offsets.push_back(read_view->consumed()); + stack_.back().offsets.push_back(item.offset); + buffer_->consume(item.offset); + return Process::CONTINUE; + } + + Process process_comment(CommandItem& item) { + if (item.offset == 0) { + switch (match_consume("<!--")) { + case Match::FULL_MATCH: + item.offset += 3; + break; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + return no_match(item); + } + } + + auto match = find("-->", item.offset); + switch (match) { + case Match::FULL_MATCH: { + auto data = buffer_->rspan(item.offset); + assert(data.size() >= item.offset); + delegate_->comment( + make_string_view(data.subspan(3, item.offset - 3))); + buffer_->consume(item.offset + 3); + return Process::CONTINUE; + } + case Match::NO_MATCH: + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + } + } + + Process process_processing_instruction(CommandItem& item) { + if (item.offset == 0) { + switch (match_consume("<?")) { + case 
Match::FULL_MATCH: + item.offset += 2; + break; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + return no_match(item); + } + } + + // TODO + delegate_->error("PI not supported"); + return Process::ERROR; + } + + void add_to_stack(CommandItem const& item, std::size_t offset) { + cmds_.emplace_back(item.command, item.count, offset); + stack_.emplace_back(); + buffer_ = make_read_view_buffer(std::move(buffer_)); + buffer_->consume(offset); + } + + std::size_t pop_stack(std::vector<std::size_t>& attr) { + assert(!stack_.empty()); + std::swap(attr, stack_.back().offsets); + + auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get()); + auto consumed = read_view->consumed(); + + buffer_ = read_view->release(); + stack_.pop_back(); + + return consumed; + } + + Process process_xmldecl(CommandItem const& item) { + // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' + if (item.offset == 0) { + switch (match("<?xml")) { + case Match::FULL_MATCH: + add_to_stack(item, /* offset */ 5); + expect_space(Count::ZERO_OR_ONE); + // Parsing as generic "Attribute" here and doing validation later. + expect_attribute(Count::ONE_OR_MANY); + return Process::CONTINUE; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + return no_match(item); + } + } + + assert(item.offset == 5); + + // Remember that this is still reading for the read view buffer. 
+ switch (match_consume("?>")) { + case Match::FULL_MATCH: + break; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + delegate_->error(std::format("Expected end of {}", + command_name(item.command))); + return Process::ERROR; + } + + std::vector<std::size_t> attr; + auto const consumed = pop_stack(attr); + + // Now we're back to the real buffer + auto data = buffer_->rspan(consumed); + std::size_t a = 0; + + if (a + 4 <= attr.size() && + make_string_view(data.subspan(attr[a + 0], + attr[a + 1])) == "version") { + auto version = make_string_view(data.subspan(attr[a + 2] + 1, + attr[a + 3] - 2)); + if (!valid_version(version)) { + delegate_->error(std::format("Unsupported xmldecl version, {}", + version)); + return Process::ERROR; + } + a += 4; + } else { + // No version + delegate_->error("Invalid xmldecl, must have a version attribute first."); + return Process::ERROR; + } + + if (a + 4 <= attr.size() && + make_string_view(data.subspan(attr[a + 0], + attr[a + 1])) == "encoding") { + auto encoding = make_string_view(data.subspan(attr[a + 2] + 1, + attr[a + 3] - 2)); + if (forced_decoder_) { + // encoding value is ignored + // TODO: Should we check that it is valid anyway? + } else { + auto decoder = pick_decoder_for_encoding(encoding, nullptr); + if (!decoder && decoder_factory_) + decoder = decoder_factory_->create(encoding); + if (!decoder) { + delegate_->error(std::format("Unknown encoding {}", encoding)); + return Process::ERROR; + } + std::swap(decoder_, decoder); + // TODO: Re-decode the rest of the buffer? 
+ } + a += 4; + } + + if (a + 4 <= attr.size() && + make_string_view(data.subspan(attr[a + 0], + attr[a + 1])) == "standalone") { + auto sd = make_string_view(data.subspan(attr[a + 2] + 1, + attr[a + 3] - 2)); + if (sd == "yes") { + // TODO: Handle standalone == yes + } else if (sd == "no") { + // TODO: Handle standalone == no + } else { + delegate_->error(std::format( + "Invalid xmldecl, standalone attribute has unsupported value, {}", + sd)); + return Process::ERROR; + } + a += 4; + } + + if (a < attr.size()) { + delegate_->error( + std::format("Invalid xmldecl, unknown attribute, {}", + make_string_view(data.subspan(attr[a + 0], + attr[a + 1])))); + return Process::ERROR; + } + + buffer_->consume(consumed); + return Process::CONTINUE; + } + + Process process_start_or_empty_tag(CommandItem const& item) { + // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' + // STag ::= '<' Name (S Attribute)* S? '>' + if (item.offset == 0) { + switch (match("<")) { + case Match::FULL_MATCH: + add_to_stack(item, /* offset */ 1); + expect_space(Count::ZERO_OR_ONE); + expect_attribute(Count::ZERO_OR_MANY); + expect_name(Count::ONE); + return Process::CONTINUE; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + return no_match(item); + } + } + + assert(item.offset == 1); + + bool empty_tag; + + // Remember that this is still reading for the read view buffer. 
+ switch (match_consume("/>")) { + case Match::FULL_MATCH: + empty_tag = true; + break; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + switch (match_consume(">")) { + case Match::FULL_MATCH: + empty_tag = false; + break; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + delegate_->error(std::format("Expected end of {}", + command_name(item.command))); + return Process::ERROR; + } + break; + } + + std::vector<std::size_t> attr; + auto const consumed = pop_stack(attr); + + // Now we're back to the real buffer + auto data = buffer_->rspan(consumed); + + assert(attr.size() >= 2); + auto name = make_string_view(data.subspan(attr[0], attr[1])); + + AttributesImpl attributes; + if (!attributes.init(entities_, data, std::move(attr), 2)) { + delegate_->error("Invalid references in attribute values"); + return Process::ERROR; + } + + add_if_more(item); + + if (empty_tag) { + delegate_->empty_element(name, attributes); + } else { + delegate_->start_element(name, attributes); + expect_end_tag(Count::ONE); + expect_content(Count::ONE); + } + + buffer_->consume(consumed); + return Process::CONTINUE; + } + + Process process_end_tag(CommandItem const& item) { + // ETag ::= '</' Name S? '>' + if (item.offset == 0) { + switch (match("</")) { + case Match::FULL_MATCH: + add_to_stack(item, /* offset */ 2); + expect_space(Count::ZERO_OR_ONE); + expect_name(Count::ONE); + return Process::CONTINUE; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + return no_match(item); + } + } + + assert(item.offset == 1); + + // Remember that this is still reading for the read view buffer. 
+ switch (match_consume(">")) { + case Match::FULL_MATCH: + break; + case Match::PARTIAL_MATCH: + return Process::NEED_MORE; + case Match::NO_MATCH: + delegate_->error(std::format("Expected end of {}", + command_name(item.command))); + return Process::ERROR; + } + + std::vector<std::size_t> attr; + auto const consumed = pop_stack(attr); + + // Now we're back to the real buffer + auto data = buffer_->rspan(consumed); + + assert(attr.size() == 2); + auto name = make_string_view(data.subspan(attr[0], attr[1])); + + add_if_more(item); + + delegate_->end_element(name); + + buffer_->consume(consumed); + return Process::CONTINUE; + } + + static bool valid_version(std::string_view version) { + if (version.size() < 3) + return false; + if (!version.starts_with("1.")) + return false; + for (std::size_t i = 2; i < version.size(); ++i) { + if (!is_digit(version[i])) + return false; + } + return true; + } + + Process process_element(CommandItem& item) { + // TODO + delegate_->error("Element is not yet supported"); + return Process::ERROR; + } + + Process consume_space(std::size_t& count, uint32_t& last_char) { + auto data = buffer_->rspan(4); + std::size_t consumed = 0; + while (true) { + std::size_t offset = consumed; + auto c = utf::read8(data, offset); + if (c == utf::NEED_MORE) { + buffer_->consume(consumed); + return Process::NEED_MORE; + } + if (c == utf::INVALID || !valid_char(c)) + return invalid_char(data, offset); + if (!is_ws(c)) { + last_char = c; + buffer_->consume(consumed); + return Process::CONTINUE; + } + ++count; + handle_ws(c); + consumed = offset; + } + } + + Process process_space(CommandItem& item) { + // S ::= (#x20 | #x9 | #xD | #xA)+ + // item.offset is only used to count spaces. We consume each space as it + // is found so no offset in buffer. 
+ uint32_t unused; + auto ret = consume_space(item.offset, unused); + if (ret != Process::CONTINUE) + return ret; + + if (item.offset == 0) + return no_match(item); + + add_if_more(item); + return Process::CONTINUE; + } + + void add_if_more(CommandItem const& item) { + switch (item.count) { + case Count::ONE: + break; + case Count::ONE_OR_MANY: + cmds_.emplace_back(item.command, Count::ZERO_OR_MANY); + break; + case Count::ZERO_OR_ONE: + break; + case Count::ZERO_OR_MANY: + cmds_.emplace_back(item.command, item.count); + } + } + + Match find(std::string_view str, std::size_t& offset) { + auto data = buffer_->rspan(offset + str.size()); + std::size_t i = 0; + while (offset < data.size()) { + if (str[i] == data[offset]) { + ++i; + if (i == str.size()) { + offset -= i; + return Match::FULL_MATCH; + } + } else { + i = 0; + } + ++offset; + } + if (i > 0) { + offset -= i; + return Match::PARTIAL_MATCH; + } + return Match::NO_MATCH; + } + + Match match(std::string_view str, std::size_t offset = 0) { + auto data = buffer_->rspan(offset + str.size()); + if (data.size() <= offset) + return Match::PARTIAL_MATCH; + auto const avail = std::min(str.size(), data.size() - offset); + for (std::size_t i = 0; i < avail; ++i) { + if (str[i] != data[offset + i]) + return Match::NO_MATCH; + } + if (avail < str.size()) + return Match::PARTIAL_MATCH; + return Match::FULL_MATCH; + } + + Match match_consume(std::string_view str) { + auto ret = match(str); + if (ret == Match::FULL_MATCH) + buffer_->consume(str.size()); + return ret; + } + + Match match_s() { + auto data = buffer_->rspan(4); + std::size_t offset = 0; + auto c = utf::read8(data, offset); + if (c == utf::NEED_MORE) + return data.size() == 0 ? 
Match::PARTIAL_MATCH : Match::NO_MATCH; + if (c == utf::INVALID) + return Match::NO_MATCH; + if (!valid_char(c) || !is_ws(c)) + return Match::NO_MATCH; + return Match::FULL_MATCH; + } + + Process no_match(CommandItem const& item) { + switch (item.count) { + case Count::ONE: + case Count::ONE_OR_MANY: + delegate_->error(std::format("Expected {}", + command_name(item.command))); + return Process::ERROR; + case Count::ZERO_OR_ONE: + case Count::ZERO_OR_MANY: + break; + } + return Process::CONTINUE; + } + + void handle_ws(uint32_t c) { + if (c == '\n') { + ++line_; + column_ = 0; + } else { + ++column_; + } + } + + Process invalid_char(std::span<uint8_t const> data, std::size_t offset) { + delegate_->error(std::format("Invalid char {:02x}", data[offset])); + return Process::ERROR; + } + + static std::string_view command_name(Command command) { + switch (command) { + case Command::MISC: + return "misc"sv; + case Command::FILL_BUFFER: + return "more data"sv; + case Command::ELEMENT: + return "element"sv; + case Command::SPACE: + return "whitespace"sv; + case Command::COMMENT: + return "comment"sv; + case Command::PROCESSING_INSTRUCTION: + return "processing instruction"sv; + case Command::XMLDECL: + return "xml declaration"sv; + case Command::ATTRIBUTE: + return "attribute"sv; + case Command::ATTRIBUTE_VALUE: + return "attribute value"sv; + case Command::NAME: + return "name"sv; + case Command::EQUAL: + return "equal sign (=)"sv; + case Command::START_OR_EMPTY_TAG: + return "element"sv; + case Command::END_TAG: + return "end tag"sv; + } + assert(false); + return {}; + } + std::shared_ptr<Delegate> delegate_; std::shared_ptr<DecoderFactory> decoder_factory_; std::unique_ptr<Decoder> decoder_; - std::size_t default_buffer_size_; - std::size_t max_buffer_size_; + bool const forced_decoder_; + std::unique_ptr<Buffer> buffer_; + Entities entities_; + std::vector<CommandItem> cmds_; + std::vector<StackItem> stack_; + uint64_t line_{1}; + uint64_t column_{0}; }; } // namespace 
@@ -117,9 +1196,9 @@ std::unique_ptr<Processor> create_processor( decoder_factory.get()); } - std::size_t default_buffer_size = 8192; + std::size_t default_buffer_size = kDefaultBufferSize; if (opt_default_buffer_size.has_value()) - default_buffer_size = std::max(static_cast<std::size_t>(128), + default_buffer_size = std::max(kMinBufferSize, opt_default_buffer_size.value()); // This value is documented in public headers. Do NOT change. std::size_t max_buffer_size = 10 * 1024 * 1024; @@ -136,7 +1215,8 @@ std::unique_ptr<Processor> create_processor( max_buffer_size); } -std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) { +std::unique_ptr<Processor> +Processor::create(std::shared_ptr<Delegate> delegate) { return create_processor(std::move(delegate), nullptr, std::nullopt, std::nullopt, std::nullopt); } diff --git a/sax/src/utils.cc b/sax/src/utils.cc index f0366d5..e3a53b1 100644 --- a/sax/src/utils.cc +++ b/sax/src/utils.cc @@ -9,7 +9,7 @@ namespace sax { namespace { -std::string cleanup_encoding(std::string const& str) { +std::string cleanup_encoding(std::string_view str) { std::string ret; ret.reserve(str.size()); for (auto c : str) { @@ -29,29 +29,29 @@ std::string cleanup_encoding(std::string const& str) { // Names inspired by: // https://www.iana.org/assignments/character-sets/character-sets.xhtml std::unique_ptr<Decoder> pick_decoder_for_encoding( - std::string const& encoding, DecoderFactory* factory) { + std::string_view encoding, DecoderFactory* factory) { auto clean_enc = cleanup_encoding(encoding); - if (clean_enc == "utf-8" || clean_enc == "utf8") { + if (clean_enc == "utf-8" || clean_enc == "utf8") return create_utf8_decoder(); - } - if (clean_enc == "utf-16" || clean_enc == "utf16") { + + if (clean_enc == "utf-16" || clean_enc == "utf16") return create_utf16_decoder(); - } - if (clean_enc == "utf-16be" || clean_enc == "utf16be") { + + if (clean_enc == "utf-16be" || clean_enc == "utf16be") return create_utf16be_decoder(); - } - if 
(clean_enc == "utf-16le" || clean_enc == "utf16le") { + + if (clean_enc == "utf-16le" || clean_enc == "utf16le") return create_utf16le_decoder(); - } - if (clean_enc == "utf-32" || clean_enc == "utf32") { + + if (clean_enc == "utf-32" || clean_enc == "utf32") return create_utf32_decoder(); - } - if (clean_enc == "utf-32be" || clean_enc == "utf32be") { + + if (clean_enc == "utf-32be" || clean_enc == "utf32be") return create_utf32be_decoder(); - } - if (clean_enc == "utf-32le" || clean_enc == "utf32le") { + + if (clean_enc == "utf-32le" || clean_enc == "utf32le") return create_utf32le_decoder(); - } + if (clean_enc == "ascii" || clean_enc == "us-ascii" || clean_enc == "usascii" || clean_enc == "iso-ir-6" || clean_enc == "ansi-x3-4-1968" || clean_enc == "ansi-x3-4-1986" || @@ -59,9 +59,10 @@ std::unique_ptr<Decoder> pick_decoder_for_encoding( clean_enc == "us" || clean_enc == "ibm367" || clean_enc == "cp367") { return create_ascii_decoder(); } - if (factory) { + + if (factory) return factory->create(encoding); - } + return nullptr; } diff --git a/sax/src/utils.hh b/sax/src/utils.hh index 206d003..074f0c0 100644 --- a/sax/src/utils.hh +++ b/sax/src/utils.hh @@ -4,7 +4,7 @@ #include "macros.hh" #include <memory> -#include <string> +#include <string_view> namespace modxml { namespace sax { @@ -13,7 +13,7 @@ class Decoder; class DecoderFactory; std::unique_ptr<Decoder> HIDDEN pick_decoder_for_encoding( - std::string const& encoding, + std::string_view encoding, DecoderFactory* factory); } // namespace sax diff --git a/sax/tst/test_buffer.cc b/sax/tst/test_buffer.cc new file mode 100644 index 0000000..13bc6d4 --- /dev/null +++ b/sax/tst/test_buffer.cc @@ -0,0 +1,272 @@ +#include "buffer.hh" + +#include <gmock/gmock.h> +#include <gtest/gtest.h> + +namespace { + +enum class BufferType { + FIXED, + DYNAMIC, +}; + +class BufferTest : public testing::TestWithParam<BufferType> { + protected: + std::unique_ptr<modxml::sax::Buffer> make_buffer(std::size_t size) { + switch 
(GetParam()) { + case BufferType::FIXED: + return modxml::sax::make_buffer(size, size); + case BufferType::DYNAMIC: + return modxml::sax::make_buffer(size / 2, size); + } + return nullptr; + } +}; + +std::array<uint8_t, 10> AAAAAAAAAA{ + 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; +std::array<uint8_t, 5> BBBBB{ + 'B', 'B', 'B', 'B', 'B'}; + +} // namespace + +TEST_P(BufferTest, sanity) { + auto buf = make_buffer(10); + EXPECT_TRUE(buf->empty()); + EXPECT_FALSE(buf->full()); + + EXPECT_TRUE(buf->write_all(AAAAAAAAAA)); + EXPECT_TRUE(buf->full()); + EXPECT_FALSE(buf->empty()); + + EXPECT_FALSE(buf->write_all(AAAAAAAAAA)); + + std::array<uint8_t, 10> tmp10; + EXPECT_TRUE(buf->read_all(tmp10)); + EXPECT_THAT(tmp10, testing::ContainerEq(AAAAAAAAAA)); + EXPECT_TRUE(buf->empty()); + EXPECT_FALSE(buf->full()); + + EXPECT_TRUE(buf->write_all(BBBBB)); + EXPECT_FALSE(buf->full()); + EXPECT_FALSE(buf->empty()); + + EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); + EXPECT_TRUE(buf->full()); + EXPECT_FALSE(buf->empty()); + + std::array<uint8_t, 3> tmp3; + EXPECT_TRUE(buf->read_all(tmp3)); + EXPECT_THAT(tmp3, testing::ElementsAre('B', 'B', 'B')); + + EXPECT_EQ(3u, buf->write(BBBBB)); + + EXPECT_TRUE(buf->read_all(tmp3)); + EXPECT_THAT(tmp3, testing::ElementsAre('B', 'B', 'A')); + + std::array<uint8_t, 5> tmp5; + EXPECT_TRUE(buf->read_all(tmp5)); + EXPECT_THAT(tmp5, testing::ElementsAre('A', 'A', 'A', 'A', 'B')); + + EXPECT_FALSE(buf->read_all(tmp3)); + tmp3[2] = 'X'; + EXPECT_EQ(2u, buf->read(tmp3)); + EXPECT_THAT(tmp3, testing::ElementsAre('B', 'B', 'X')); +} + +TEST_P(BufferTest, noop) { + auto buf = make_buffer(10); + EXPECT_TRUE(buf->empty()); + + std::array<uint8_t, 0> empty; + EXPECT_EQ(0u, buf->write(empty)); + EXPECT_EQ(0u, buf->read(empty)); + + EXPECT_TRUE(buf->write_all(empty)); + EXPECT_TRUE(buf->read_all(empty)); + + buf->commit(0); + buf->consume(0); + + EXPECT_TRUE(buf->empty()); +} + +TEST_P(BufferTest, one_byte_filler) { + auto buf = make_buffer(10); + + 
std::array<uint8_t, 1> tmp1; + uint8_t out = 0; + for (uint8_t in = 0; in <= 20; ++in) { + tmp1[0] = in; + EXPECT_TRUE(buf->write_all(tmp1)); + if (in >= 9) { + EXPECT_TRUE(buf->read_all(tmp1)); + EXPECT_EQ(tmp1[0], out); + ++out; + } + } + for (; out <= 20; ++out) { + EXPECT_TRUE(buf->read_all(tmp1)); + EXPECT_EQ(tmp1[0], out); + } + EXPECT_TRUE(buf->empty()); +} + +TEST_P(BufferTest, read_wrap) { + auto buf = make_buffer(10); + + EXPECT_TRUE(buf->write_all(BBBBB)); + EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); + + std::array<uint8_t, 5> tmp5; + EXPECT_TRUE(buf->read_all(tmp5)); + EXPECT_THAT(tmp5, testing::ContainerEq(BBBBB)); + + EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); + + std::array<uint8_t, 10> tmp10; + EXPECT_TRUE(buf->read_all(tmp10)); + EXPECT_THAT(tmp10, testing::ContainerEq(AAAAAAAAAA)); +} + +TEST_P(BufferTest, skip_wrap) { + auto buf = make_buffer(10); + + EXPECT_TRUE(buf->write_all(BBBBB)); + EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); + + buf->consume(5); + EXPECT_FALSE(buf->empty()); + + EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); + + buf->consume(10); + EXPECT_TRUE(buf->empty()); +} + +TEST_P(BufferTest, write_wrap) { + auto buf = make_buffer(12); + + EXPECT_TRUE(buf->write_all(BBBBB)); + + std::array<uint8_t, 3> tmp3; + EXPECT_TRUE(buf->read_all(tmp3)); + EXPECT_THAT(tmp3, testing::ElementsAre('B', 'B', 'B')); + + EXPECT_TRUE(buf->write_all(AAAAAAAAAA)); + + std::array<uint8_t, 12> tmp12; + EXPECT_EQ(12u, buf->read(tmp12)); + EXPECT_THAT(tmp12, testing::ElementsAre( + 'B', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A')); +} + +TEST_P(BufferTest, read_wrap2) { + auto buf = make_buffer(12); + + EXPECT_TRUE(buf->write_all(AAAAAAAAAA)); + + std::array<uint8_t, 7> tmp7; + EXPECT_TRUE(buf->read_all(tmp7)); + EXPECT_THAT(tmp7, testing::ElementsAre('A', 'A', 'A', 'A', 'A', 'A', 'A')); + + EXPECT_EQ(5u, buf->write(BBBBB)); + EXPECT_EQ(4u, buf->write(BBBBB)); + + std::array<uint8_t, 12> tmp12; + EXPECT_TRUE(buf->read_all(tmp12)); + EXPECT_THAT(tmp12, 
testing::ElementsAre( + 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B')); +} + +TEST(Buffer, dynamic_resize) { + auto buf = modxml::sax::make_buffer(10, 1000); + + std::array<uint8_t, 30> tmp30; + for (uint8_t i = 0; i < 30; ++i) + tmp30[i] = i; + + EXPECT_TRUE(buf->write_all(tmp30)); + EXPECT_TRUE(buf->write_all(tmp30)); + + std::array<uint8_t, 60> tmp60; + EXPECT_TRUE(buf->read_all(tmp60)); + for (uint8_t i = 0; i < 60; ++i) + EXPECT_EQ(i % 30, tmp60[i]) << i; +} + +TEST(Buffer, dynamic_overalloc) { + // This test can fail, but in most configurations trying to allocate + // std::numeric_limits<std::size_t>::max() will fail. + auto buf = modxml::sax::make_buffer(10, std::numeric_limits<std::size_t>::max()); + EXPECT_FALSE(buf->wspan(10000).empty()); + EXPECT_TRUE(buf->wspan(std::numeric_limits<std::size_t>::max()).empty()); +} + +TEST_P(BufferTest, modify) { + auto buf = make_buffer(10); + + EXPECT_TRUE(buf->write_all(AAAAAAAAAA)); + + auto span = buf->mspan(5); + EXPECT_EQ(10u, span.size()); + auto len = std::min(static_cast<std::size_t>(5), span.size()); + for (uint8_t i = 0; i < len; ++i) + span[i] = 'C'; + + std::array<uint8_t, 10> tmp10; + EXPECT_TRUE(buf->read_all(tmp10)); + EXPECT_THAT(tmp10, testing::ElementsAre( + 'C', 'C', 'C', 'C', 'C', 'A', 'A', 'A', 'A', 'A')); +} + +TEST_P(BufferTest, uncommit) { + auto buf = make_buffer(10); + + EXPECT_TRUE(buf->write_all(BBBBB)); + + EXPECT_EQ(0u, buf->uncommit(0)); + + EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); + + std::array<uint8_t, 2> tmp2; + EXPECT_TRUE(buf->read_all(tmp2)); + EXPECT_THAT(tmp2, testing::ElementsAre('B', 'B')); + + EXPECT_EQ(3u, buf->uncommit(3)); + std::array<uint8_t, 5> tmp5; + EXPECT_TRUE(buf->read_all(tmp5)); + EXPECT_THAT(tmp5, testing::ElementsAre('B', 'B', 'B', 'A', 'A')); + + EXPECT_EQ(0u, buf->uncommit(2)); +} + +TEST_P(BufferTest, uncommit_wrap) { + auto buf = make_buffer(10); + + EXPECT_TRUE(buf->write_all(AAAAAAAAAA)); + std::array<uint8_t, 5> tmp5; + 
EXPECT_TRUE(buf->read_all(tmp5)); + + EXPECT_TRUE(buf->write_all(BBBBB)); + + EXPECT_EQ(8u, buf->uncommit(8)); + std::array<uint8_t, 2> tmp2; + EXPECT_TRUE(buf->read_all(tmp2)); + EXPECT_THAT(tmp2, testing::ElementsAre('A', 'A')); +} + +INSTANTIATE_TEST_SUITE_P( + BufferTests, + BufferTest, + testing::Values(BufferType::FIXED, BufferType::DYNAMIC), + [](auto& info) { + switch (info.param) { + case BufferType::FIXED: + return "fixed"; + case BufferType::DYNAMIC: + return "dynamic"; + } + return ""; + } +); diff --git a/sax/tst/test_decoder.cc b/sax/tst/test_decoder.cc new file mode 100644 index 0000000..86f230b --- /dev/null +++ b/sax/tst/test_decoder.cc @@ -0,0 +1,242 @@ +#include "sax_decoder.hh" +#include "sax_decoder_factory.hh" +#include "sax_processor.hh" +#include "sax_delegate.hh" + +#include <memory> +#include <gtest/gtest.h> + +namespace { + +class TestDelegate : public modxml::sax::Delegate { + public: + ~TestDelegate() override = default; + + void empty_element(std::string_view name, + modxml::sax::Attributes const&) override { + EXPECT_EQ(name, "root"); + if (name == "root") { + EXPECT_FALSE(have_root_); + have_root_ = true; + } + } + + void error(std::string_view message) override { + have_error_ = true; + FAIL() << message; + } + + bool have_root() const { return have_root_; } + + bool have_error() const { return have_error_; } + + private: + bool have_root_{false}; + bool have_error_{false}; +}; + +bool process_all(modxml::sax::Processor& processor, + TestDelegate& delegate, + std::span<uint8_t const> data) { + std::size_t offset = 0; + while (offset < data.size()) { + auto consumed = processor.process(data, offset); + if (consumed == 0 || delegate.have_error()) + return false; + offset += consumed; + } + return true; +} + +} // namespace + +TEST(sax, decoder_utf8) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::string input = R"(<?xml version="1.0" encoding="utf-8"?><root 
/>)"; + std::cerr << input << std::endl; + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(reinterpret_cast<uint8_t const*>(input.data()), + input.size()))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf8_bom) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::string input = + "\xef\xbb\xbf" R"(<?xml version="1.0" encoding="utf-8"?><root />)"; + std::cerr << input << std::endl; + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(reinterpret_cast<uint8_t const*>(input.data()), + input.size()))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf16) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u16string input = uR"(<?xml version="1.0" encoding="utf-16"?><root />)"; + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(reinterpret_cast<uint8_t const*>(input.data()), + input.size() * sizeof(char16_t)))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf16be) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u16string str = uR"(<?xml version="1.0" encoding="utf-16"?><root />)"; + std::vector<uint8_t> input; + for (char16_t c : str) { + input.push_back(c >> 8); + input.push_back(c & 0xff); + } + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(input.data(), input.size()))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf16le) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u16string str = uR"(<?xml version="1.0" encoding="utf-16"?><root />)"; + std::vector<uint8_t> input; + for (char16_t c : str) { + input.push_back(c & 0xff); + input.push_back(c >> 8); 
+ } + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(input.data(), input.size()))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf16be_bom) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u16string str = + u"\ufffe" uR"(<?xml version="1.0" encoding="utf-16"?><root />)"; + std::vector<uint8_t> input; + for (char16_t c : str) { + input.push_back(c >> 8); + input.push_back(c & 0xff); + } + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(input.data(), input.size()))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf16le_bom) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u16string str = + u"\ufffe" uR"(<?xml version="1.0" encoding="utf-16"?><root />)"; + std::vector<uint8_t> input; + for (char16_t c : str) { + input.push_back(c & 0xff); + input.push_back(c >> 8); + } + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(input.data(), input.size()))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf32) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u32string input = UR"(<?xml version="1.0" encoding="utf-32"?><root />)"; + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(reinterpret_cast<uint8_t const*>(input.data()), + input.size() * sizeof(char32_t)))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf32be) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u32string str = UR"(<?xml version="1.0" encoding="utf-32"?><root />)"; + std::vector<uint8_t> input; + for (char32_t c : str) { + input.push_back(c >> 24); + input.push_back((c >> 16) & 0xff); 
+ input.push_back((c >> 8) & 0xff); + input.push_back(c & 0xff); + } + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(input.data(), input.size()))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf32le) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u32string str = UR"(<?xml version="1.0" encoding="utf-32"?><root />)"; + std::vector<uint8_t> input; + for (char32_t c : str) { + input.push_back(c & 0xff); + input.push_back((c >> 8) & 0xff); + input.push_back((c >> 16) & 0xff); + input.push_back(c >> 24); + } + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(input.data(), input.size()))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf32be_bom) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u32string str = + U"\ufffe" UR"(<?xml version="1.0" encoding="utf-32"?><root />)"; + std::vector<uint8_t> input; + for (char32_t c : str) { + input.push_back(c >> 24); + input.push_back((c >> 16) & 0xff); + input.push_back((c >> 8) & 0xff); + input.push_back(c & 0xff); + } + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(input.data(), input.size()))); + EXPECT_TRUE(delegate->have_root()); +} + +TEST(sax, decoder_utf32le_bom) { + auto delegate = std::make_shared<TestDelegate>(); + auto processor = modxml::sax::Processor::create(delegate); + std::u32string str = + U"\ufffe" R"(<?xml version="1.0" encoding="utf-32"?><root />)"; + std::vector<uint8_t> input; + for (char32_t c : str) { + input.push_back(c & 0xff); + input.push_back((c >> 8) & 0xff); + input.push_back((c >> 16) & 0xff); + input.push_back(c >> 24); + } + EXPECT_TRUE(process_all( + *processor.get(), + *delegate.get(), + std::span<uint8_t const>(input.data(), input.size()))); + EXPECT_TRUE(delegate->have_root()); 
+} diff --git a/utf/inc/utf16.hh b/utf/inc/utf16.hh index 344b1a2..b9229bc 100644 --- a/utf/inc/utf16.hh +++ b/utf/inc/utf16.hh @@ -4,27 +4,29 @@ #include "macros.hh" #include <cstdint> -#include <string_view> +#include <span> namespace utf { -/* Read one unicode codepoint from UTF-16 BigEndian encoded data if possible. +/** + * Read one unicode codepoint from UTF-16 BigEndian encoded data if possible. * If successfull offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs, * returns INVALID. */ -uint32_t HIDDEN read16be(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read16be(std::span<uint8_t const> data, std::size_t& offset); -/* Read one unicode codepoint from UTF-16 LittleEndian encoded data if possible. +/** + * Read one unicode codepoint from UTF-16 LittleEndian encoded data if possible. * If successfull offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs, * returns INVALID. */ -uint32_t HIDDEN read16le(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read16le(std::span<uint8_t const> data, std::size_t& offset); } // namespace utf diff --git a/utf/inc/utf32.hh b/utf/inc/utf32.hh index 2d3088e..4ee5eac 100644 --- a/utf/inc/utf32.hh +++ b/utf/inc/utf32.hh @@ -4,25 +4,27 @@ #include "macros.hh" #include <cstdint> -#include <string_view> +#include <span> namespace utf { -/* Read one unicode codepoint from UTF-32 BigEndian encoded data if possible. +/** + * Read one unicode codepoint from UTF-32 BigEndian encoded data if possible. * If successfull offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-32, ie. 
outside valid ranges, returns INVALID. */ -uint32_t HIDDEN read32be(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read32be(std::span<uint8_t const> data, std::size_t& offset); -/* Read one unicode codepoint from UTF-32 LittleEndian encoded data if possible. +/** + * Read one unicode codepoint from UTF-32 LittleEndian encoded data if possible. * If successfull offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-32, ie. outside valid ranges, returns INVALID. */ -uint32_t HIDDEN read32le(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read32le(std::span<uint8_t const> data, std::size_t& offset); } // namespace utf diff --git a/utf/inc/utf8.hh b/utf/inc/utf8.hh index a3ea84a..7735ecd 100644 --- a/utf/inc/utf8.hh +++ b/utf/inc/utf8.hh @@ -4,18 +4,29 @@ #include "macros.hh" #include <cstdint> -#include <string_view> +#include <span> namespace utf { -/* Read one unicode codepoint from UTF-8 encoded data if possible. - * If successfull offset is incremented to point to next codepoint. +/** + * Read one unicode codepoint from UTF-8 encoded data if possible. + * If successful, offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-8, this includes overlong encodings and * invalid unicode code points, returns INVALID. */ -uint32_t HIDDEN read8(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read8(std::span<uint8_t const> data, std::size_t& offset); + +/** + * Write one unicode codepoint to UTF-8 encoded data if possible. + * If successful, offset is incremented to the end of the written data + * and true is returned. + * If not successful, offset is not incremented and false is returned. + * data is not modified. 
+ */ +bool HIDDEN write8(uint32_t codepoint, std::span<uint8_t> data, + std::size_t& offset); } // namespace utf diff --git a/utf/meson.build b/utf/meson.build index 64db6ff..051ddd1 100644 --- a/utf/meson.build +++ b/utf/meson.build @@ -23,16 +23,16 @@ test('utf8', executable( 'test_utf8', sources: ['tst/test_utf8.cc'], - dependencies: [utf_dep, gtest_dep])) + dependencies: [utf_dep, gmock_dep, gtest_dep])) test('utf16', executable( 'test_utf16', sources: ['tst/test_utf16.cc'], - dependencies: [utf_dep, gtest_dep])) + dependencies: [utf_dep, gmock_dep, gtest_dep])) test('utf32', executable( 'test_utf32', sources: ['tst/test_utf32.cc'], - dependencies: [utf_dep, gtest_dep])) + dependencies: [utf_dep, gmock_dep, gtest_dep])) diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc index 43595bf..623c1be 100644 --- a/utf/src/utf16.cc +++ b/utf/src/utf16.cc @@ -16,7 +16,7 @@ inline bool is_low_surrogate(uint16_t c) { } // namespace -uint32_t read16be(std::string_view data, std::size_t& offset) { +uint32_t read16be(std::span<uint8_t const> data, std::size_t& offset) { if (offset > data.size() || data.size() - offset < 2) return NEED_MORE; uint16_t c = static_cast<uint16_t>(data[offset]) << 8 @@ -40,7 +40,7 @@ uint32_t read16be(std::string_view data, std::size_t& offset) { return c; } -uint32_t read16le(std::string_view data, std::size_t& offset) { +uint32_t read16le(std::span<uint8_t const> data, std::size_t& offset) { if (offset > data.size() || data.size() - offset < 2) return NEED_MORE; uint16_t c = static_cast<uint16_t>(data[offset + 1]) << 8 diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc index cfa29b6..e33b0b4 100644 --- a/utf/src/utf32.cc +++ b/utf/src/utf32.cc @@ -12,7 +12,7 @@ inline bool valid_codepoint(uint32_t c) { } // namespace -uint32_t read32be(std::string_view data, std::size_t& offset) { +uint32_t read32be(std::span<uint8_t const> data, std::size_t& offset) { if (offset > data.size() || data.size() - offset < 4) return NEED_MORE; uint32_t c = 
static_cast<uint32_t>(data[offset]) << 24 @@ -26,7 +26,7 @@ uint32_t read32be(std::string_view data, std::size_t& offset) { return INVALID; } -uint32_t read32le(std::string_view data, std::size_t& offset) { +uint32_t read32le(std::span<uint8_t const> data, std::size_t& offset) { if (offset > data.size() || data.size() - offset < 4) return NEED_MORE; uint32_t c = static_cast<uint32_t>(data[offset + 3]) << 24 diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc index 54b0296..0e444ae 100644 --- a/utf/src/utf8.cc +++ b/utf/src/utf8.cc @@ -12,12 +12,12 @@ inline bool valid_codepoint(uint32_t c) { } // namespace -uint32_t read8(std::string_view data, std::size_t& offset) { +uint32_t read8(std::span<uint8_t const> data, std::size_t& offset) { if (offset >= data.size()) return NEED_MORE; uint32_t ret; uint8_t size; - switch (static_cast<uint8_t>(data[offset]) >> 4) { + switch (data[offset] >> 4) { case 15: if (data[offset] & 0x08) return INVALID; @@ -65,4 +65,41 @@ uint32_t read8(std::string_view data, std::size_t& offset) { return ret; } +bool write8(uint32_t codepoint, std::span<uint8_t> data, std::size_t& offset) { + // Surrogates and values above U+10FFFF are not unicode scalar values and + // have no UTF-8 encoding; refuse them instead of writing malformed bytes. + bool const surrogate = codepoint >= 0xd800 && codepoint <= 0xdfff; + if (surrogate || codepoint > 0x10ffff) UNLIKELY { + return false; + } + if (offset >= data.size()) UNLIKELY { + return false; + } + if (codepoint < 0x80) { + data[offset++] = codepoint; + } else if (codepoint < 0x800) { + if (data.size() - offset < 2) UNLIKELY { + return false; + } + data[offset++] = 0xc0 | (codepoint >> 6); + data[offset++] = 0x80 | (codepoint & 0x3f); + } else if (codepoint < 0x10000) { + if (data.size() - offset < 3) UNLIKELY { + return false; + } + data[offset++] = 0xe0 | (codepoint >> 12); + data[offset++] = 0x80 | ((codepoint >> 6) & 0x3f); + data[offset++] = 0x80 | (codepoint & 0x3f); + } else { + if (data.size() - offset < 4) UNLIKELY { + return false; + } + data[offset++] = 0xf0 | (codepoint >> 18); + data[offset++] = 0x80 | ((codepoint >> 12) & 0x3f); + data[offset++] = 0x80 | ((codepoint >> 6) & 0x3f); + data[offset++] = 0x80 | (codepoint & 0x3f); + } + return true; +} + } // namespace utf diff --git 
a/utf/tst/test_utf16.cc b/utf/tst/test_utf16.cc index c17982e..3b3c03c 100644 --- a/utf/tst/test_utf16.cc +++ b/utf/tst/test_utf16.cc @@ -2,156 +2,137 @@ #include "utf_error.hh" +#include <array> #include <gtest/gtest.h> TEST(utf16be, sanity) { - std::string_view str("\x00\x24", 2); size_t offset = 0; - auto ret = utf::read16be(str, offset); + auto ret = utf::read16be(std::array<uint8_t, 2>({0x00, 0x24}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(2, offset); - str = "\x20\xAC"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array<uint8_t, 2>({0x20, 0xAC}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(2, offset); - str = "\xD8\x01\xDC\x37"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array<uint8_t, 4>({0xD8, 0x01, 0xDC, 0x37}), offset); EXPECT_EQ(0x10437, ret); EXPECT_EQ(4, offset); - str = "\xD8\x52\xDF\x62"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array<uint8_t, 4>({0xD8, 0x52, 0xDF, 0x62}), offset); EXPECT_EQ(0x24B62, ret); EXPECT_EQ(4, offset); } TEST(utf16le, sanity) { - std::string_view str("\x24\x00", 2); size_t offset = 0; - auto ret = utf::read16le(str, offset); + auto ret = utf::read16le(std::array<uint8_t, 2>({0x24, 0x00}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(2, offset); - str = "\xAC\x20"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array<uint8_t, 2>({0xAC, 0x20}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(2, offset); - str = "\x01\xD8\x37\xDC"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array<uint8_t, 4>({0x01, 0xD8, 0x37, 0xDC}), offset); EXPECT_EQ(0x10437, ret); EXPECT_EQ(4, offset); - str = "\x52\xD8\x62\xDF"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array<uint8_t, 4>({0x52, 0xD8, 0x62, 0xDF}), offset); EXPECT_EQ(0x24B62, ret); EXPECT_EQ(4, offset); } TEST(utf16be, bom) { - std::string_view str("\xFE\xFF\x20\xAC"); + std::array<uint8_t, 4> data({0xFE, 0xFF, 
0x20, 0xAC}); size_t offset = 0; - auto ret = utf::read16be(str, offset); + auto ret = utf::read16be(data, offset); EXPECT_EQ(0xFEFF, ret); - ret = utf::read16be(str, offset); + ret = utf::read16be(data, offset); EXPECT_EQ(0x20AC, ret); - ret = utf::read16be(str, offset); + ret = utf::read16be(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } TEST(utf16le, bom) { - std::string_view str("\xFF\xFE\xAC\x20"); + std::array<uint8_t, 4> data({0xFF, 0xFE, 0xAC, 0x20}); size_t offset = 0; - auto ret = utf::read16le(str, offset); + auto ret = utf::read16le(data, offset); EXPECT_EQ(0xFEFF, ret); - ret = utf::read16le(str, offset); + ret = utf::read16le(data, offset); EXPECT_EQ(0x20AC, ret); - ret = utf::read16le(str, offset); + ret = utf::read16le(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } TEST(utf16be, invalid) { - std::string_view str("\xD8"); size_t offset = 0; - auto ret = utf::read16be(str, offset); + auto ret = utf::read16be(std::array<uint8_t, 1>({0xD8}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array<uint8_t, 0>(), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\xD8\x01"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array<uint8_t, 2>({0xD8, 0x01}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\xD8\x01\xDC"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array<uint8_t, 3>({0xD8, 0x01, 0xDC}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\xDC\x37\xD8\x01"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array<uint8_t, 4>({0xDC, 0x37, 0xD8, 0x01}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xD8\x01\xD8\x01"; offset = 0; - ret = 
utf::read16be(str, offset); + ret = utf::read16be(std::array<uint8_t, 4>({0xD8, 0x01, 0xD8, 0x01}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); } TEST(utf16le, invalid) { - std::string_view str("\x01"); size_t offset = 0; - auto ret = utf::read16le(str, offset); + auto ret = utf::read16le(std::array<uint8_t, 1>({0x01}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array<uint8_t, 0>(), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\x01\xD8"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array<uint8_t, 2>({0x01, 0xD8}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\x01\xD8\x37"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array<uint8_t, 3>({0x01, 0xD8, 0x37}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\x37\xDC\x01\xD8"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array<uint8_t, 4>({0x37, 0xDC, 0x01, 0xD8}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\x01\xD8\x01\xD8"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array<uint8_t, 4>({0x01, 0xD8, 0x01, 0xD8}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); } diff --git a/utf/tst/test_utf32.cc b/utf/tst/test_utf32.cc index 796b4cd..447b541 100644 --- a/utf/tst/test_utf32.cc +++ b/utf/tst/test_utf32.cc @@ -2,144 +2,137 @@ #include "utf_error.hh" +#include <array> #include <gtest/gtest.h> TEST(utf32be, sanity) { - std::string_view str("\x00\x00\x00\x24", 4); size_t offset = 0; - auto ret = utf::read32be(str, offset); + auto ret = utf::read32be( + std::array<uint8_t, 4>({0x00, 0x00, 0x00, 0x24}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(4, offset); - str = std::string_view("\x00\x00\x20\xAC", 4); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be( 
+ std::array<uint8_t, 4>({0x00, 0x00, 0x20, 0xAC}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(4, offset); - str = std::string_view("\x00\x01\x04\x37", 4); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be( + std::array<uint8_t, 4>({0x00, 0x01, 0x04, 0x37}), offset); EXPECT_EQ(0x10437, ret); EXPECT_EQ(4, offset); } TEST(utf32le, sanity) { - std::string_view str("\x24\x00\x00\x00", 4); size_t offset = 0; - auto ret = utf::read32le(str, offset); + auto ret = utf::read32le( + std::array<uint8_t, 4>({0x24, 0x00, 0x00, 0x00}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(4, offset); - str = std::string_view("\xAC\x20\x00\x00", 4); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le( + std::array<uint8_t, 4>({0xAC, 0x20, 0x00, 0x00}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(4, offset); - str = std::string_view("\x37\x04\x01\x00", 4); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le( + std::array<uint8_t, 4>({0x37, 0x04, 0x01, 0x00}), offset); EXPECT_EQ(0x10437, ret); EXPECT_EQ(4, offset); } TEST(utf32be, invalid) { - std::string_view str("\xFF\xFF\xFF\xFF"); size_t offset = 0; - auto ret = utf::read32be(str, offset); + auto ret = utf::read32be( + std::array<uint8_t, 4>({0xFF, 0xFF, 0xFF, 0xFF}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00\xD8\x00", 4); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be( + std::array<uint8_t, 4>({0x00, 0x00, 0xD8, 0x00}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be(std::array<uint8_t, 0>(), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00", 1); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be(std::array<uint8_t, 1>({0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00", 2); offset = 0; - ret = 
utf::read32be(str, offset); + ret = utf::read32be(std::array<uint8_t, 2>({0x00, 0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00\x00", 3); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be(std::array<uint8_t, 3>({0x00, 0x00, 0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); } TEST(utf32le, invalid) { - std::string_view str("\xFF\xFF\xFF\xFF"); size_t offset = 0; - auto ret = utf::read32le(str, offset); + auto ret = utf::read32le( + std::array<uint8_t, 4>({0xFF, 0xFF, 0xFF, 0xFF}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\xD8\x00\x00", 4); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le( + std::array<uint8_t, 4>({0x00, 0xD8, 0x00, 0x00}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le(std::array<uint8_t, 0>(), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00", 1); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le(std::array<uint8_t, 1>({0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00", 2); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le(std::array<uint8_t, 2>({0x00, 0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00\x00", 3); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le(std::array<uint8_t, 3>({0x00, 0x00, 0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); } TEST(utf32be, bom) { - std::string_view str("\x00\x00\xFF\xFE\x00\x00\x20\xAC", 8); + std::array<uint8_t, 8> data({0x00, 0x00, 0xFF, 0xFE, 0x00, 0x00, 0x20, 0xAC}); size_t offset = 0; - auto ret = utf::read32be(str, offset); + auto ret = utf::read32be(data, offset); EXPECT_EQ(0xFFFE, ret); - ret = 
utf::read32be(str, offset); + ret = utf::read32be(data, offset); EXPECT_EQ(0x20AC, ret); - ret = utf::read32be(str, offset); + ret = utf::read32be(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } TEST(utf32le, bom) { - std::string_view str("\xFE\xFF\x00\x00\xAC\x20\x00\x00", 8); + std::array<uint8_t, 8> data({0xFE, 0xFF, 0x00, 0x00, 0xAC, 0x20, 0x00, 0x00}); size_t offset = 0; - auto ret = utf::read32le(str, offset); + auto ret = utf::read32le(data, offset); EXPECT_EQ(0xFFFE, ret); - ret = utf::read32le(str, offset); + ret = utf::read32le(data, offset); EXPECT_EQ(0x20AC, ret); - ret = utf::read32le(str, offset); + ret = utf::read32le(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } diff --git a/utf/tst/test_utf8.cc b/utf/tst/test_utf8.cc index 10df969..8bdeba4 100644 --- a/utf/tst/test_utf8.cc +++ b/utf/tst/test_utf8.cc @@ -2,187 +2,245 @@ #include "utf_error.hh" +#include <array> +#include <gmock/gmock.h> #include <gtest/gtest.h> +#include <span> -TEST(utf8, sanity) { - std::string_view str("$"); +TEST(utf8, read_sanity) { size_t offset = 0; - auto ret = utf::read8(str, offset); + auto ret = utf::read8(std::array<uint8_t, 1>({'$'}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(1, offset); - str = "\xC2\xA3"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 2>({0xC2, 0xA3}), offset); EXPECT_EQ(0xa3, ret); EXPECT_EQ(2, offset); - str = "\xD0\x98"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 2>({0xD0, 0x98}), offset); EXPECT_EQ(0x418, ret); EXPECT_EQ(2, offset); - str = "\xE0\xA4\xB9"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 3>({0xE0, 0xA4, 0xB9}), offset); EXPECT_EQ(0x939, ret); EXPECT_EQ(3, offset); - str = "\xE2\x82\xAC"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 3>({0xE2, 0x82, 
0xAC}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(3, offset); - str = "\xED\x95\x9C"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 3>({0xED, 0x95, 0x9C}), offset); EXPECT_EQ(0xD55C, ret); EXPECT_EQ(3, offset); - str = "\xF0\x90\x8D\x88"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 4>({0xF0, 0x90, 0x8D, 0x88}), offset); EXPECT_EQ(0x10348, ret); EXPECT_EQ(4, offset); } -TEST(utf8, overlong) { - std::string_view str("\xF0\x82\x82\xAC"); +TEST(utf8, write_sanity) { + std::array<uint8_t, 10> out; size_t offset = 0; - auto ret = utf::read8(str, offset); + EXPECT_TRUE(utf::write8('$', out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre('$')); + EXPECT_EQ(1, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0xa3, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xC2, 0xA3)); + EXPECT_EQ(2, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0x418, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xD0, 0x98)); + EXPECT_EQ(2, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0x939, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xE0, 0xA4, 0xB9)); + EXPECT_EQ(3, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0x20AC, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xE2, 0x82, 0xAC)); + EXPECT_EQ(3, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0xD55C, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xED, 0x95, 0x9C)); + EXPECT_EQ(3, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0x10348, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xF0, 0x90, 0x8D, 0x88)); + EXPECT_EQ(4, offset); +} + +TEST(utf8, read_overlong) { + size_t offset = 0; + auto ret = utf::read8( + std::array<uint8_t, 4>({0xF0, 0x82, 0x82, 0xAC}), 
offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xE0\x81\x81"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 3>({0xE0, 0x81, 0x81}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xC0\x80"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 2>({0xC0, 0x80}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); } -TEST(utf8, invalid) { - std::string_view str("\xED\xB0\x80"); +TEST(utf8, read_invalid) { size_t offset = 0; - auto ret = utf::read8(str, offset); + auto ret = utf::read8(std::array<uint8_t, 3>({0xED, 0xB0, 0x80}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xFB\xFF\xFF"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 3>({0xFB, 0xFF, 0xFF}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xFF\xFF\xFF\xFF\xFF"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8( + std::array<uint8_t, 5>({0xFF, 0xFF, 0xFF, 0xFF, 0xFF}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 0>(), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\x80"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 1>({0x80}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xC2"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 1>({0xC2}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\xC2\x03"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 2>({0xC2, 0x03}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xE0\xA4"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 2>({0xE0, 0xA4}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, 
offset); - str = "\xF0\x90\x8D"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array<uint8_t, 3>({0xF0, 0x90, 0x8D}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); } -TEST(utf8, multiple1) { - std::string_view str("\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69" - "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74"); +TEST(utf8, read_multiple1) { + std::array<uint8_t, 25> data({ + 0x4D, 0xC3, 0xAC, 0x6E, 0x68, 0x20, 0x6E, 0xC3, 0xB3, 0x69, + 0x20, 0x74, 0x69, 0xE1, 0xBA, 0xBF, 0x6E, 0x67, 0x20, 0x56, + 0x69, 0xE1, 0xBB, 0x87, 0x74 + }); size_t offset = 0; - auto ret = utf::read8(str, offset); + auto ret = utf::read8(data, offset); EXPECT_EQ('M', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0xEC, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('n', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('h', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(' ', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('n', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0xF3, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('i', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(' ', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('t', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('i', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x1EBF, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('n', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('g', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(' ', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, 
offset); EXPECT_EQ('V', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('i', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x1EC7, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('t', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } -TEST(utf8, multiple2) { - std::string_view str("\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A"); +TEST(utf8, read_multiple2) { + std::array<uint8_t, 13> data({ + 0xF0, 0xA8, 0x89, 0x9F, 0xE5, 0x91, 0x90, 0xE3, 0x97, 0x82, + 0xE8, 0xB6, 0x8A, + }); size_t offset = 0; - auto ret = utf::read8(str, offset); + auto ret = utf::read8(data, offset); EXPECT_EQ(0x2825F, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x5450, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x35C2, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x8D8A, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); +} + +TEST(utf8, write_no_space) { + std::array<uint8_t, 10> data; + std::span<uint8_t> out(data); + size_t offset = 0; + EXPECT_FALSE(utf::write8('$', out.subspan(0, 0), offset)); + EXPECT_EQ(0u, offset); + + EXPECT_FALSE(utf::write8(0xa3, out.subspan(0, 1), offset)); + EXPECT_EQ(0u, offset); + EXPECT_FALSE(utf::write8(0x418, out.subspan(0, 0), offset)); + EXPECT_EQ(0u, offset); + + EXPECT_FALSE(utf::write8(0x939, out.subspan(0, 2), offset)); + EXPECT_EQ(0u, offset); + EXPECT_FALSE(utf::write8(0x20AC, out.subspan(0, 0), offset)); + EXPECT_EQ(0u, offset); + + EXPECT_FALSE(utf::write8(0x10348, out.subspan(0, 3), offset)); + EXPECT_EQ(0u, offset); } |
