summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoel Klinghed <the_jk@spawned.biz>2024-01-21 12:31:30 +0100
committerJoel Klinghed <the_jk@spawned.biz>2024-01-21 12:31:30 +0100
commit7dd49c6293172b494c78918507242cdb55d35137 (patch)
tree9c8ab822ab9501a5ea2f937e609144e00ea091c4
parentfc4547b412e28164af1bf8981234c6af959ccc0b (diff)
WIP
-rw-r--r--base/meson.build1
-rw-r--r--meson.build2
-rw-r--r--sax/inc/sax_attributes.hh146
-rw-r--r--sax/inc/sax_decoder.hh26
-rw-r--r--sax/inc/sax_decoder_factory.hh4
-rw-r--r--sax/inc/sax_delegate.hh22
-rw-r--r--sax/inc/sax_processor.hh18
-rw-r--r--sax/inc/sax_processor_builder.hh2
-rw-r--r--sax/meson.build20
-rw-r--r--sax/src/buffer.cc398
-rw-r--r--sax/src/buffer.hh108
-rw-r--r--sax/src/decoder.cc308
-rw-r--r--sax/src/guessing_decoder.cc92
-rw-r--r--sax/src/guessing_decoder.hh21
-rw-r--r--sax/src/sax_attributes.cc38
-rw-r--r--sax/src/sax_delegate.cc21
-rw-r--r--sax/src/sax_processor.cc1098
-rw-r--r--sax/src/utils.cc37
-rw-r--r--sax/src/utils.hh4
-rw-r--r--sax/tst/test_buffer.cc272
-rw-r--r--sax/tst/test_decoder.cc242
-rw-r--r--utf/inc/utf16.hh12
-rw-r--r--utf/inc/utf32.hh12
-rw-r--r--utf/inc/utf8.hh19
-rw-r--r--utf/meson.build6
-rw-r--r--utf/src/utf16.cc4
-rw-r--r--utf/src/utf32.cc4
-rw-r--r--utf/src/utf8.cc35
-rw-r--r--utf/tst/test_utf16.cc81
-rw-r--r--utf/tst/test_utf32.cc85
-rw-r--r--utf/tst/test_utf8.cc204
31 files changed, 2928 insertions, 414 deletions
diff --git a/base/meson.build b/base/meson.build
index 71faace..7668487 100644
--- a/base/meson.build
+++ b/base/meson.build
@@ -8,7 +8,6 @@ if cpp.compiles('''int foo() {
return 0;
}''', name: 'C++20 unlikely attribute')
cpp_flags += '-DHAVE_ATTRIBUTE_UNLIKELY'
- cpp_flags += '-Wno-c++20-attribute-extensions'
endif
inc = include_directories('inc')
diff --git a/meson.build b/meson.build
index 2d571dc..d8a9641 100644
--- a/meson.build
+++ b/meson.build
@@ -4,7 +4,7 @@ project(
meson_version: '>= 0.58',
default_options : [
'warning_level=3',
- 'cpp_std=c++17',
+ 'cpp_std=c++20',
'cpp_rtti=false',
'cpp_eh=none',
'b_ndebug=if-release',
diff --git a/sax/inc/sax_attributes.hh b/sax/inc/sax_attributes.hh
new file mode 100644
index 0000000..4ab1a44
--- /dev/null
+++ b/sax/inc/sax_attributes.hh
@@ -0,0 +1,146 @@
+#ifndef SAX_ATTRIBUTES_HH
+#define SAX_ATTRIBUTES_HH
+
+#include <iterator>
+#include <optional>
+#include <string_view>
+
+namespace modxml {
+namespace sax {
+
+struct Attribute {  // One attribute as a pair of non-owning string views.
+ std::string_view name;
+ std::string_view value;
+
+ Attribute(std::string_view name, std::string_view value);
+};
+
+/**
+ * A view of attributes, with utility functions.
+ * Attributes are read by index through at(); the iterators below are thin
+ * (instance, index) pairs that dereference through at() as well.
+ */
+class Attributes {
+ public:
+ virtual ~Attributes() = default;
+
+ class iterator {
+ public:
+ using iterator_category = std::random_access_iterator_tag;
+ using difference_type = std::ptrdiff_t;
+ using element_type = Attribute;
+ using pointer = element_type const *;
+ using reference = element_type const &;
+
+ iterator()
+ : attributes_(nullptr), index_(0) {}
+ iterator(iterator const& it)
+ : attributes_(it.attributes_), index_(it.index_) {}
+ iterator& operator=(iterator const& it) {
+ attributes_ = it.attributes_;
+ index_ = it.index_;
+ return *this;
+ }
+
+ /**
+ * Comparing two iterators from different Attributes instances is undefined.
+ * Only the index is compared, the instance an iterator belongs to is not.
+ */
+ bool operator==(iterator const& it) const {
+ return index_ == it.index_;
+ }
+ std::strong_ordering operator<=>(iterator const& it) const {
+ return index_ <=> it.index_;
+ }
+
+ pointer operator->() const { return &attributes_->at(index_); }
+ reference operator*() const { return attributes_->at(index_); }
+ reference operator[](difference_type i) const {
+ return attributes_->at(index_ + i);
+ }
+
+ iterator& operator++() {
+ ++index_;
+ return *this;
+ }
+ iterator operator++(int) {
+ auto ret = *this;
+ ++index_;
+ return ret;
+ }
+ iterator& operator+=(difference_type i) {
+ index_ += i;
+ return *this;
+ }
+ iterator operator+(difference_type i) const {
+ return iterator(attributes_, index_ + i);
+ }
+ friend iterator operator+(difference_type i, iterator const &it) {
+ return iterator(it.attributes_, it.index_ + i);
+ }
+ iterator& operator--() {
+ --index_;
+ return *this;
+ }
+ iterator operator--(int) {
+ auto ret = *this;
+ --index_;
+ return ret;
+ }
+ iterator& operator-=(difference_type i) {
+ index_ -= i;
+ return *this;
+ }
+ difference_type operator-(iterator const& it) const {
+ return index_ - it.index_;
+ }
+ iterator operator-(difference_type i) const {
+ return iterator(attributes_, index_ - i);
+ }
+
+ protected:
+ iterator(Attributes const* attributes, std::size_t index)
+ : attributes_(attributes), index_(index) {}
+
+ private:
+ Attributes const* attributes_;
+ std::size_t index_;
+ };
+
+ static_assert(std::random_access_iterator<iterator>);
+
+ virtual iterator begin() const = 0;
+ virtual iterator end() const = 0;
+
+ virtual std::size_t size() const = 0;
+ /**
+ * name and value of attribute are valid as long as Attributes instance is.
+ */
+ virtual Attribute const& at(std::size_t index) const = 0;
+
+ Attribute const& operator[](std::size_t index) const { return at(index); }
+
+ /**
+ * Return the value of the first attribute with name, if any.
+ */
+ virtual std::optional<std::string_view> find_first(
+ std::string_view name) const;
+
+ /**
+ * Return the value of the last attribute with name, if any.
+ */
+ virtual std::optional<std::string_view> find_last(
+ std::string_view name) const;
+
+ /**
+ * Return the index of the attribute with name, starting the search at index.
+ */
+ virtual std::optional<std::size_t> find(std::string_view name,
+ std::size_t index = 0) const;
+
+ protected:
+ Attributes() = default;
+};
+
+} // namespace sax
+} // namespace modxml
+
+
+#endif // SAX_ATTRIBUTES_HH
diff --git a/sax/inc/sax_decoder.hh b/sax/inc/sax_decoder.hh
index 40a56c9..8b2490c 100644
--- a/sax/inc/sax_decoder.hh
+++ b/sax/inc/sax_decoder.hh
@@ -1,16 +1,15 @@
#ifndef SAX_DECODER_HH
#define SAX_DECODER_HH
-#include <memory>
-#include <string>
-#include <string_view>
+#include <cstdint>
+#include <span>
namespace modxml {
namespace sax {
/**
* Decoder returned by DecoderFactory. Used by Processor to turn bytes into
- * unicode characters.
+ * unicode characters encoded as UTF-8.
*/
class Decoder {
public:
@@ -18,9 +17,9 @@ class Decoder {
enum class State {
GOOD = 0,
- // too little data was given to advance
+ // too little data was given to decode
NEED_MORE,
- // invalid data was given to advance
+ // invalid data was given to decode
INVALID,
};
@@ -29,23 +28,22 @@ class Decoder {
* write them to out (start at out_offset) as UTF-8.
* All written code points must be valid per Unicode, so inside the
* range U+0 to U+10FFFF and not a surrogate pair (U+D800-U+DFFF).
- * No partial output, only write to out if the whole UTF-8 sequence is
- * going to fit.
- * The is always at least 4 bytes available (out.size() - out_offset) when
+ * No partial code point output, only write to out if the whole UTF-8
+ * sequence for the code point is going to fit.
+ * There will always be at least 4 bytes available (out.size() - out_offset) when
* called.
- * Advance in_offset for data consumed.
+ * Advance in_offset for data consumed. Do NOT read past in.size().
* Advance out_offset for code points written. Do NOT write past out.size().
- * Do NOT resize out.
* If at least one code point is decoded and written to out, return GOOD.
* If it is not possible to decode a single code point, in_offset and
* out_offset should not be advanced and something other than GOOD returned.
* Do not keep any references to any of the parameters after returning, next
- * advance() call will point to the following bytes, but all parameters
+ * decode() call will point to the following bytes, but all parameters
* may have changed as they are subject to the buffer implementations of the
* Processor.
*/
- virtual State decode(std::string_view in, std::size_t& in_offset,
- std::string& out, std::size_t& out_offset) = 0;
+ virtual State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) = 0;
protected:
Decoder() = default;
diff --git a/sax/inc/sax_decoder_factory.hh b/sax/inc/sax_decoder_factory.hh
index 80f1af3..2361ac3 100644
--- a/sax/inc/sax_decoder_factory.hh
+++ b/sax/inc/sax_decoder_factory.hh
@@ -2,7 +2,7 @@
#define SAX_DECODER_FACTORY_HH
#include <memory>
-#include <string>
+#include <string_view>
namespace modxml {
namespace sax {
@@ -23,7 +23,7 @@ class DecoderFactory {
* Note that encoding value isn't cleaned up or validated in any way, it is
* reported EXACTLY as found (even if not valid per XML spec).
*/
- virtual std::unique_ptr<Decoder> create(std::string const& encoding) = 0;
+ virtual std::unique_ptr<Decoder> create(std::string_view encoding) = 0;
protected:
DecoderFactory() = default;
diff --git a/sax/inc/sax_delegate.hh b/sax/inc/sax_delegate.hh
index ba63e72..59af2b7 100644
--- a/sax/inc/sax_delegate.hh
+++ b/sax/inc/sax_delegate.hh
@@ -1,9 +1,14 @@
#ifndef MODXML_SAX_DELEGATE_HH
#define MODXML_SAX_DELEGATE_HH
+#include <cstdint>
+#include <string_view>
+
namespace modxml {
namespace sax {
+class Attributes;
+
/**
* Delegate for processor.
* Implement to handle events.
@@ -12,6 +17,23 @@ class Delegate {
public:
virtual ~Delegate() = default;
+ virtual void start_element(std::string_view name,
+ Attributes const& attributes);
+
+ virtual void end_element(std::string_view name);
+
+ virtual void empty_element(std::string_view name,
+ Attributes const& attributes);
+
+ virtual void character_data(std::string_view data);
+
+ virtual void processing_instruction(std::string_view target,
+ std::string_view data);
+
+ virtual void comment(std::string_view data);
+
+ virtual void error(std::string_view message);
+
protected:
Delegate() = default;
};
diff --git a/sax/inc/sax_processor.hh b/sax/inc/sax_processor.hh
index 7ca32f7..cf53807 100644
--- a/sax/inc/sax_processor.hh
+++ b/sax/inc/sax_processor.hh
@@ -2,6 +2,7 @@
#define MODXML_SAX_PROCESSOR_HH
#include <memory>
+#include <span>
namespace modxml {
namespace sax {
@@ -23,6 +24,23 @@ class Processor {
*/
static std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate);
+ /**
+ * Process (consume) input data.
+ * Returns bytes consumed, can be zero.
+ */
+ virtual std::size_t process(std::span<uint8_t const> data,
+ std::size_t offset = 0) = 0;
+
+ /**
+ * When called from delegate, points to the start of the element that
+ * triggered the callback.
+ * When called otherwise, points to the last element that was processed.
+ * Lines start at 1.
+ * Columns start at 0.
+ */
+ virtual uint64_t line() const = 0;
+ virtual uint64_t column() const = 0;
+
protected:
Processor() = default;
diff --git a/sax/inc/sax_processor_builder.hh b/sax/inc/sax_processor_builder.hh
index 070fbbf..8b114e4 100644
--- a/sax/inc/sax_processor_builder.hh
+++ b/sax/inc/sax_processor_builder.hh
@@ -48,7 +48,7 @@ class ProcessorBuilder {
* If you give a too small buffer size (such as zero) it will be ignored
* and a implementation specific minimum will be used instead.
* This is meant as a possible optimization and can be completely ignored.
- * Note that the processor will allocate more data if it needed.
+ * Note that the processor will allocate more data if it needs to.
*/
virtual ProcessorBuilder* set_default_buffer_size(std::size_t size) = 0;
diff --git a/sax/meson.build b/sax/meson.build
index ccbdef4..8797c41 100644
--- a/sax/meson.build
+++ b/sax/meson.build
@@ -6,7 +6,11 @@ deps = [
inc = include_directories('inc')
lib = shared_library(
'modxmlsax',
+ 'src/buffer.cc',
'src/decoder.cc',
+ 'src/guessing_decoder.cc',
+ 'src/sax_attributes.cc',
+ 'src/sax_delegate.cc',
'src/sax_processor.cc',
'src/sax_processor_builder.cc',
'src/utils.cc',
@@ -20,3 +24,19 @@ sax_dep = declare_dependency(
include_directories: inc,
link_with: lib,
)
+
+test('buffer',
+ executable(
+ 'test_buffer',
+ sources: [
+ 'src/buffer.cc',
+ 'tst/test_buffer.cc',
+ ],
+ include_directories: 'src',
+ dependencies: [base_dep, gmock_dep, gtest_dep]))
+
+test('decoder',
+ executable(
+ 'test_decoder',
+ sources: ['tst/test_decoder.cc'],
+ dependencies: [sax_dep, gtest_dep]))
diff --git a/sax/src/buffer.cc b/sax/src/buffer.cc
new file mode 100644
index 0000000..964865d
--- /dev/null
+++ b/sax/src/buffer.cc
@@ -0,0 +1,398 @@
+#include "buffer.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <limits>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Growable linear buffer. Starts with default_size bytes of storage and
+// doubles it on demand, never exceeding max_size. The readable data is
+// always the contiguous range [offset_, offset_ + fill_) of data_.
+class DynamicBuffer : public Buffer {
+ public:
+  DynamicBuffer(std::size_t default_size, std::size_t max_size)
+      : default_size_(std::min(default_size, max_size)), max_size_(max_size),
+        data_(std::make_unique_for_overwrite<uint8_t[]>(default_size_)),
+        size_(default_size_) {}
+
+  // Returns the free tail of the storage. When fewer than need bytes are
+  // free the data is first compacted to the front and, if that is still not
+  // enough, the storage is grown. Returns an empty span when need can never
+  // fit below max_size or when the allocation fails.
+  std::span<uint8_t> wspan(std::size_t need) override {
+    auto avail = size_ - (offset_ + fill_);
+    if (need > avail) {
+      if (max_size_ - fill_ < need)  // Early exit if need is never possible
+        return {};
+      if (offset_ > 0) {
+        // Reclaim the already consumed prefix before considering growth.
+        std::copy_n(data_.get() + offset_, fill_, data_.get());
+        offset_ = 0;
+      }
+      avail = size_ - fill_;
+      if (need > avail) {
+        auto const max = std::numeric_limits<std::size_t>::max() / 2;
+        // Start from at least one byte: with default_size == 0 the storage
+        // is empty and doubling zero would never terminate the loop below.
+        std::size_t new_size = std::max<std::size_t>(size_, 1);
+        while (true) {
+          if (new_size <= max) {
+            new_size *= 2;
+          } else {
+            // Doubling would overflow, saturate instead.
+            new_size = std::numeric_limits<std::size_t>::max();
+          }
+          if (new_size >= max_size_) {
+            new_size = max_size_;
+            break;
+          }
+          if (new_size - fill_ >= need)
+            break;
+        }
+        // Using new as it has std::nothrow which make_unique lacks.
+        // Easy enough to keep track of the pointers here anyway.
+        auto* tmp = new(std::nothrow) uint8_t[new_size];
+        if (tmp == nullptr)
+          return {};
+        std::copy_n(data_.get(), fill_, tmp);
+        size_ = new_size;
+        data_.reset(tmp);
+      }
+    }
+    return {data_.get() + offset_ + fill_, size_ - (offset_ + fill_)};
+  }
+
+  // Marks size bytes of the last wspan() as readable data.
+  void commit(std::size_t size) override {
+    assert(size_ - (offset_ + fill_) >= size);
+    fill_ += size;
+  }
+
+  // All data is contiguous, so the whole content is returned regardless of
+  // the requested size.
+  std::span<uint8_t const> rspan(std::size_t) override {
+    return {data_.get() + offset_, fill_};
+  }
+
+  // Drops size bytes from the front. A drained buffer is reset, which also
+  // shrinks the storage back to default_size.
+  void consume(std::size_t size) override {
+    if (size == 0)
+      return;
+    assert(fill_ >= size);
+    fill_ -= size;
+    if (fill_ == 0) {
+      reset();
+    } else {
+      offset_ += size;
+    }
+  }
+
+  // Same range as rspan() but writable.
+  std::span<uint8_t> mspan(std::size_t) override {
+    return {data_.get() + offset_, fill_};
+  }
+
+  // Drops up to size bytes from the end and returns how many were dropped.
+  std::size_t uncommit(std::size_t size) override {
+    auto ret = std::min(size, fill_);
+    fill_ -= ret;
+    if (fill_ == 0) {
+      reset();
+    }
+    return ret;
+  }
+
+  bool empty() const override {
+    return fill_ == 0;
+  }
+
+  bool full() const override {
+    return fill_ >= max_size_;
+  }
+
+  // Discards all data; storage that has grown is replaced by a fresh
+  // default_size allocation.
+  void reset() override {
+    if (size_ != default_size_) {
+      size_ = default_size_;
+      data_ = std::make_unique_for_overwrite<uint8_t[]>(size_);
+    }
+    offset_ = 0;
+    fill_ = 0;
+  }
+
+ private:
+  std::size_t const default_size_;
+  std::size_t const max_size_;
+  std::unique_ptr<uint8_t[]> data_;
+  std::size_t size_;
+  std::size_t offset_{0};
+  std::size_t fill_{0};
+};
+
+// Fixed-capacity ring buffer of exactly size bytes. rptr_ and wptr_ are the
+// read and write positions; rptr_ == wptr_ is ambiguous, so full_ tells a
+// full buffer from an empty one. An empty buffer is always rewound to
+// position 0 so the whole storage is available as one contiguous span.
+class FixedBuffer : public Buffer {
+ public:
+  explicit FixedBuffer(std::size_t size)
+      : size_(size), data_(std::make_unique<uint8_t[]>(size_)) {}
+
+  // Returns the contiguous free region at the write position. If it is
+  // smaller than need, the data is moved to the front of the storage when
+  // that frees enough room; otherwise an empty span is returned.
+  std::span<uint8_t> wspan(std::size_t need) override {
+    auto avail = wavail();
+    if (need > avail) {
+      if (need > size_ - ravail())  // Early exit if need will never fit
+        return {};
+      if (rptr_ < wptr_ || (rptr_ == wptr_ && !full_)) {
+        rotate();
+        avail = wavail();
+      } else {
+        // Wrapped or full: the free region is already a single piece, so
+        // moving data around cannot make it any larger.
+        return {};
+      }
+    }
+    return {data_.get() + wptr_, avail};
+  }
+
+  // Marks size bytes of the last wspan() as readable data.
+  void commit(std::size_t size) override {
+    if (size == 0)
+      return;
+    assert(wavail() >= size);
+    wptr_ += size;
+    if (wptr_ == size_)
+      wptr_ = 0;
+    if (rptr_ == wptr_)
+      full_ = true;
+  }
+
+  std::span<uint8_t const> rspan(std::size_t want) override {
+    return mspan(want);
+  }
+
+  // Drops size bytes from the front; a drained buffer is rewound.
+  void consume(std::size_t size) override {
+    if (size == 0)
+      return;
+    assert(ravail() >= size);
+    full_ = false;
+    rptr_ += size;
+    if (rptr_ == size_)
+      rptr_ = 0;
+    if (rptr_ == wptr_)
+      reset();
+  }
+
+  // Returns the contiguous readable region at the read position. When the
+  // data wraps around the end of the storage and more than the first piece
+  // is wanted, the data is first made contiguous.
+  std::span<uint8_t> mspan(std::size_t want) override {
+    auto avail = ravail();
+    if (want > avail) {
+      if (rptr_ > wptr_ || (rptr_ == wptr_ && full_)) {
+        rotate();
+        avail = ravail();
+      }
+    }
+    return {data_.get() + rptr_, avail};
+  }
+
+  // Drops up to size bytes from the end and returns how many were dropped.
+  // Two passes are needed when the data wraps around the end of the storage.
+  std::size_t uncommit(std::size_t size) override {
+    if (size == 0)
+      return 0;
+    auto ret = do_uncommit(size);
+    if (ret < size) {
+      ret += do_uncommit(size - ret);
+    }
+    // Keep the "empty means rewound" invariant; without it a later wspan()
+    // could see an empty buffer whose pointers sit in the middle of the
+    // storage and hand out less room than is actually free.
+    if (rptr_ == wptr_ && !full_)
+      reset();
+    return ret;
+  }
+
+  bool empty() const override {
+    return rptr_ == wptr_ && !full_;
+  }
+
+  bool full() const override {
+    return rptr_ == wptr_ && full_;
+  }
+
+  void reset() override {
+    rptr_ = 0;
+    wptr_ = 0;
+    full_ = false;
+  }
+
+ private:
+  // Bytes readable in one piece starting at rptr_.
+  std::size_t ravail() const {
+    if (rptr_ < wptr_)
+      return wptr_ - rptr_;
+    if (rptr_ == wptr_ && !full_)
+      return 0;
+    return size_ - rptr_;
+  }
+
+  // Bytes writable in one piece starting at wptr_.
+  std::size_t wavail() const {
+    if (rptr_ > wptr_)
+      return rptr_ - wptr_;
+    if (rptr_ == wptr_ && full_)
+      return 0;
+    return size_ - wptr_;
+  }
+
+  // Removes up to size bytes from the piece of data that ends at wptr_.
+  std::size_t do_uncommit(std::size_t size) {
+    if (size == 0 || (rptr_ == wptr_ && !full_))
+      return 0;
+
+    full_ = false;
+
+    // A write pointer at 0 means the data ends at the end of the storage.
+    if (wptr_ == 0)
+      wptr_ = size_;
+
+    auto avail = rptr_ < wptr_ ? wptr_ - rptr_ : wptr_;
+    avail = std::min(avail, size);
+    wptr_ -= avail;
+    return avail;
+  }
+
+  // Moves the readable data to the start of the storage so that it forms a
+  // single run beginning at index 0. Works in place, no allocation.
+  void rotate() {
+    if (rptr_ == 0)
+      return;  // Data already starts at the front.
+
+    if (rptr_ == wptr_ && !full_) {
+      // Empty: nothing to move, just rewind.
+      rptr_ = 0;
+      wptr_ = 0;
+      return;
+    }
+
+    if (rptr_ < wptr_) {
+      // One piece in the middle: slide it down. Destination starts before
+      // the source, so a forward copy is safe.
+      std::copy(data_.get() + rptr_, data_.get() + wptr_, data_.get());
+      wptr_ -= rptr_;
+      rptr_ = 0;
+      return;
+    }
+
+    // Wrapped or full: the older part lives in [rptr_, size_) and the newer
+    // part in [0, wptr_). Rotating the whole storage left by rptr_ puts the
+    // older part first, directly followed by the newer part.
+    auto const used = wptr_ + (size_ - rptr_);
+    std::rotate(data_.get(), data_.get() + rptr_, data_.get() + size_);
+    wptr_ = used == size_ ? 0 : used;
+    rptr_ = 0;
+  }
+
+  std::size_t const size_;
+  std::unique_ptr<uint8_t[]> data_;
+  std::size_t rptr_{0};
+  std::size_t wptr_{0};
+  bool full_{false};
+};
+
+// ReadViewBuffer implementation. Writes are forwarded to the wrapped buffer;
+// reads never consume from it, the view only advances its own offset_.
+class ReadViewBufferImpl : public ReadViewBuffer {
+ public:
+  explicit ReadViewBufferImpl(std::unique_ptr<Buffer> buffer)
+      : buffer_(std::move(buffer)) {}
+
+  std::size_t consumed() const override { return offset_; }
+
+  std::unique_ptr<Buffer> release() override { return std::move(buffer_); }
+
+  std::span<uint8_t> wspan(std::size_t need) override {
+    return buffer_->wspan(need);
+  }
+
+  void commit(std::size_t size) override { buffer_->commit(size); }
+
+  // Asks the wrapped buffer for want bytes beyond what this view has already
+  // consumed and hides the consumed prefix.
+  std::span<uint8_t const> rspan(std::size_t want) override {
+    return unread(buffer_->rspan(offset_ + want));
+  }
+
+  // Only moves the read position of the view, the wrapped buffer keeps
+  // its data.
+  void consume(std::size_t size) override { offset_ += size; }
+
+  std::span<uint8_t> mspan(std::size_t want) override {
+    return unread(buffer_->mspan(offset_ + want));
+  }
+
+  std::size_t uncommit(std::size_t size) override {
+    return buffer_->uncommit(size);
+  }
+
+  // Empty when the wrapped buffer holds nothing past the view's offset.
+  bool empty() const override {
+    return buffer_->empty() || buffer_->rspan(offset_ + 1).size() <= offset_;
+  }
+
+  bool full() const override { return buffer_->full(); }
+
+  // Rewinds the view; the wrapped buffer is left as is.
+  void reset() override { offset_ = 0; }
+
+ private:
+  // Strips the part of all that this view has already consumed.
+  template <typename T>
+  std::span<T> unread(std::span<T> all) const {
+    if (all.size() <= offset_)
+      return all.first(0);
+    return all.subspan(offset_);
+  }
+
+  std::unique_ptr<Buffer> buffer_;
+  std::size_t offset_{0};
+};
+
+} // namespace
+
+// Picks the buffer implementation: a growable buffer when there is room to
+// grow between default_size and max_size, otherwise a fixed one of max_size.
+std::unique_ptr<Buffer> make_buffer(std::size_t default_size,
+                                    std::size_t max_size) {
+  bool const can_grow = default_size < max_size;
+  if (can_grow)
+    return std::make_unique<DynamicBuffer>(default_size, max_size);
+
+  return std::make_unique<FixedBuffer>(max_size);
+}
+
+std::unique_ptr<ReadViewBuffer> make_read_view_buffer(
+ std::unique_ptr<Buffer> buffer) {  // Ownership of buffer moves into the view.
+ return std::make_unique<ReadViewBufferImpl>(std::move(buffer));
+}
+
+// Copies as much of data as the buffer accepts, one writable span at a
+// time, and returns the number of bytes that were written.
+std::size_t Buffer::write(std::span<uint8_t const> data) {
+  std::size_t written = 0;
+  for (auto left = data; !left.empty(); left = data.subspan(written)) {
+    auto const room = wspan();
+    if (room.empty())
+      break;  // Buffer cannot take any more.
+    auto const chunk = std::min(left.size(), room.size());
+    std::copy_n(left.data(), chunk, room.data());
+    commit(chunk);
+    written += chunk;
+  }
+  return written;
+}
+
+// All-or-nothing write: data is only copied if the buffer can provide one
+// writable span large enough for all of it. Empty data always succeeds.
+bool Buffer::write_all(std::span<uint8_t const> data) {
+  if (!data.empty()) {
+    auto const room = wspan(data.size());
+    if (room.empty())
+      return false;
+    std::copy_n(data.data(), data.size(), room.data());
+    commit(data.size());
+  }
+  return true;
+}
+
+// Moves as much buffered data as is available into data, one readable span
+// at a time, and returns the number of bytes that were read.
+std::size_t Buffer::read(std::span<uint8_t> data) {
+  std::size_t done = 0;
+  for (auto left = data; !left.empty(); left = data.subspan(done)) {
+    auto const avail = rspan();
+    if (avail.empty())
+      break;  // Buffer has run dry.
+    auto const chunk = std::min(left.size(), avail.size());
+    std::copy_n(avail.data(), chunk, left.data());
+    consume(chunk);
+    done += chunk;
+  }
+  return done;
+}
+
+// All-or-nothing read: data is only filled (and the bytes consumed) if the
+// buffer can provide all of it in one readable span.
+bool Buffer::read_all(std::span<uint8_t> data) {
+  auto const wanted = data.size();
+  auto const avail = rspan(wanted);
+  if (avail.size() >= wanted) {
+    std::copy_n(avail.data(), wanted, data.data());
+    consume(wanted);
+    return true;
+  }
+  return false;
+}
+
+} // namespace sax
+} // namespace modxml
+
diff --git a/sax/src/buffer.hh b/sax/src/buffer.hh
new file mode 100644
index 0000000..d9fb9fc
--- /dev/null
+++ b/sax/src/buffer.hh
@@ -0,0 +1,108 @@
+#ifndef BUFFER_HH
+#define BUFFER_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <span>
+
+namespace modxml {
+namespace sax {
+
+class HIDDEN Buffer {
+ public:
+ virtual ~Buffer() = default;
+
+ Buffer(Buffer const&) = delete;
+ Buffer& operator=(Buffer const&) = delete;
+
+ // Returns a writable span that is at least need bytes large, or an empty
+ // span if the buffer cannot provide that much room.
+ // Returned span is valid until any other method is called on the buffer.
+ virtual std::span<uint8_t> wspan(std::size_t need = 1) = 0;
+ // Commit size data from the last returned wspan. size must be <= span.size.
+ // Remember that the span is now invalid and you need to call wspan again
+ // to write more.
+ virtual void commit(std::size_t size) = 0;
+
+ // Returns a readable span of all readily available data in buffer.
+ // If there is enough data in the buffer to satisfy want, the returned
+ // span is at least as large.
+ // Returned span is valid until any other method is called on the buffer.
+ virtual std::span<uint8_t const> rspan(std::size_t want = 1) = 0;
+ // Consume size data from buffer. size must be <= span.size.
+ // Remember that the span is now invalid and you need to call rspan again
+ // to read more.
+ virtual void consume(std::size_t size) = 0;
+
+ // Returns the same span as rspan but this is writable, you can modify
+ // the content. You cannot change the size of the span.
+ // If you wish to append data, use wspan() + commit().
+ // If you wish to remove data, use uncommit().
+ // If you wish to insert you have to be clever.
+ // Returned span is valid until any other method is called on the buffer.
+ virtual std::span<uint8_t> mspan(std::size_t want = 1) = 0;
+
+ // Uncommit the last size bytes in the buffer. Returns the number of bytes
+ // removed. If you used wspan() + commit() to add ten (10) bytes, say, and
+ // then call uncommit() with a size of seven (7), the first three (3) bytes
+ // written will be left in the buffer.
+ virtual std::size_t uncommit(std::size_t size) = 0;
+
+ // Returns true if buffer is empty.
+ virtual bool empty() const = 0;
+
+ // Returns true if buffer is full. This means filled to max_size.
+ virtual bool full() const = 0;
+
+ // Clear buffer, reset back to initial state.
+ virtual void reset() = 0;
+
+ // Write as much as possible of data to buffer.
+ // Returns bytes written (may be zero).
+ std::size_t write(std::span<uint8_t const> data);
+
+ // Either write all of the data to buffer or none. Returns true if data was
+ // written or data was empty.
+ bool write_all(std::span<uint8_t const> data);
+
+ // Read as much as possible from buffer to data.
+ // Returns bytes read (may be zero).
+ std::size_t read(std::span<uint8_t> data);
+
+ // Either fill data with data from buffer or return false.
+ bool read_all(std::span<uint8_t> data);
+
+ protected:
+ Buffer() = default;
+};
+
+// Create a buffer. default_size is used as a hint, but generally that
+// will be the initial size of the buffer. max_size is a hard limit.
+// max_size == 0 is valid but will return an always full and empty buffer.
+std::unique_ptr<Buffer> HIDDEN make_buffer(std::size_t default_size,
+ std::size_t max_size);
+
+class ReadViewBuffer : public Buffer {
+ public:
+ // Returns the number of bytes consumed through this view.
+ virtual std::size_t consumed() const = 0;
+
+ // Take back ownership of the wrapped buffer from the read view.
+ // The read view is unusable afterwards.
+ virtual std::unique_ptr<Buffer> release() = 0;
+
+ protected:
+ ReadViewBuffer() = default;
+};
+
+// Create a read view buffer. Writing will go to the wrapped buffer. Reading
+// is done on the read view buffer without moving the wrapped buffer's read
+// pointer. These views are lightweight.
+std::unique_ptr<ReadViewBuffer> HIDDEN make_read_view_buffer(
+ std::unique_ptr<Buffer> buffer);
+
+} // namespace sax
+} // namespace modxml
+
+#endif // BUFFER_HH
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc
index 30b1735..35b9b46 100644
--- a/sax/src/decoder.cc
+++ b/sax/src/decoder.cc
@@ -12,273 +12,233 @@ namespace sax {
namespace {
-class UtfDecoder : public Decoder {
+class KnownEndianDecoder : public Decoder {
public:
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
+ State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) override {
+ std::size_t tmp = in_offset;
+ uint32_t ret = read(in, tmp);
+ if (ret == utf::NEED_MORE)
+ return State::NEED_MORE;
+ if (ret == utf::INVALID)
+ return State::INVALID;
+
if (bom_ == -1) UNLIKELY {
- std::size_t tmp = in_offset;
- uint32_t ret = read(in, tmp);
- if (ret == utf::NEED_MORE) {
- return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return State::INVALID;
- }
if (ret == 0xfeff) {
// To allow offset to advance and to return, we need to
// read at least one more character completely.
ret = read(in, tmp);
- if (ret == utf::NEED_MORE) {
+ if (ret == utf::NEED_MORE)
return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
+ if (ret == utf::INVALID)
return State::INVALID;
- }
bom_ = 1;
} else {
bom_ = 0;
}
- in_offset = tmp;
- out[out_offset++] = ret;
- if (out_offset == out_size)
- return State::GOOD;
+ if (!utf::write8(ret, out, out_offset)) {
+ bom_ = -1;
+ return State::NEED_MORE;
+ }
+ } else {
+ if (!utf::write8(ret, out, out_offset))
+ return State::NEED_MORE;
}
+ in_offset = tmp;
- do {
- uint32_t ret = read(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- return State::GOOD;
+ while (true) {
+ ret = read(in, tmp);
+ if (ret == utf::NEED_MORE || ret == utf::INVALID)
+ return State::GOOD;
+ if (!utf::write8(ret, out, out_offset))
+ return State::GOOD;
+ in_offset = tmp;
+ }
}
protected:
- UtfDecoder() = default;
+ KnownEndianDecoder() = default;
- virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;
+ virtual uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const = 0;
private:
int8_t bom_{-1};
};
-class Utf8Decoder : public UtfDecoder {
+class Utf8Decoder : public KnownEndianDecoder {
public:
Utf8Decoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read8(data, offset);
}
};
-class Utf16BeDecoder : public UtfDecoder {
+class Utf16BeDecoder : public KnownEndianDecoder {
public:
Utf16BeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read16be(data, offset);
}
};
-class Utf16LeDecoder : public UtfDecoder {
+class Utf16LeDecoder : public KnownEndianDecoder {
public:
Utf16LeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read16le(data, offset);
}
};
-class Utf32BeDecoder : public UtfDecoder {
+class Utf32BeDecoder : public KnownEndianDecoder {
public:
Utf32BeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read32be(data, offset);
}
};
-class Utf32LeDecoder : public UtfDecoder {
+class Utf32LeDecoder : public KnownEndianDecoder {
public:
Utf32LeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read32le(data, offset);
}
};
-class Utf16Decoder : public Decoder {
+class UnknownEndianDecoder : public Decoder {
public:
- Utf16Decoder() = default;
-
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
+ State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) override {
+ std::size_t tmp = in_offset;
if (endian_ == -1) UNLIKELY {
- std::size_t tmp = in_offset;
- uint32_t ret = utf::read16be(in, tmp);
- int8_t endian;
- if (ret == utf::NEED_MORE) {
+ uint32_t ret = readbe(in, tmp);
+ if (ret == utf::NEED_MORE)
return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
+ if (ret == utf::INVALID)
return State::INVALID;
- }
if (ret == 0xfeff) {
- endian = 1; // Big endian
+ endian_ = 1;
} else if (ret == 0xfffe) {
- endian = 0; // Little endian
+ endian_ = 0;
} else {
return State::INVALID;
}
+ in_offset = tmp;
+ }
- // To allow offset to advance and to return, we need to
- // read at least one more character completely.
- ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
- if (ret == utf::NEED_MORE) {
+ if (endian_ == 0) {
+ uint32_t ret = readle(in, tmp);
+ if (ret == utf::NEED_MORE)
return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
+ if (ret == utf::INVALID)
return State::INVALID;
- }
+ if (!utf::write8(ret, out, out_offset))
+ return State::NEED_MORE;
+ in_offset = tmp;
- endian_ = endian;
+ while (true) {
+ ret = readle(in, tmp);
+ if (ret == utf::NEED_MORE || ret == utf::INVALID)
+ return State::GOOD;
+ if (!utf::write8(ret, out, out_offset))
+ return State::GOOD;
+ in_offset = tmp;
+ }
+ } else /* if (endian_ == 1) */ {
+ uint32_t ret = readbe(in, tmp);
+ if (ret == utf::NEED_MORE)
+ return State::NEED_MORE;
+ if (ret == utf::INVALID)
+ return State::INVALID;
+ if (!utf::write8(ret, out, out_offset))
+ return State::NEED_MORE;
in_offset = tmp;
- out[out_offset++] = ret;
- if (out_offset == out_size)
- return State::GOOD;
- }
- if (endian_ == 1) {
- do {
- uint32_t ret = utf::read16be(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- } else {
- do {
- uint32_t ret = utf::read16le(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
+ while (true) {
+ ret = readbe(in, tmp);
+ if (ret == utf::NEED_MORE || ret == utf::INVALID)
+ return State::GOOD;
+ if (!utf::write8(ret, out, out_offset))
+ return State::GOOD;
+ in_offset = tmp;
+ }
}
- return State::GOOD;
}
+ protected:
+ UnknownEndianDecoder() = default;
+
+ virtual uint32_t readle(
+ std::span<uint8_t const> data, std::size_t& offset) const = 0;
+ virtual uint32_t readbe(
+ std::span<uint8_t const> data, std::size_t& offset) const = 0;
+
private:
int8_t endian_{-1};
};
-class Utf32Decoder : public Decoder {
+class Utf16Decoder : public UnknownEndianDecoder {
public:
- Utf32Decoder() = default;
+ Utf16Decoder() = default;
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
- if (endian_ == -1) UNLIKELY {
- std::size_t tmp = in_offset;
- uint32_t ret = utf::read32be(in, tmp);
- int8_t endian;
- if (ret == utf::NEED_MORE) {
- return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- tmp = in_offset;
- ret = utf::read32le(in, tmp);
- if (ret == 0xfeff) {
- endian = 0; // Little endian
- } else {
- return State::INVALID;
- }
- } else if (ret == 0xfeff) {
- endian = 1; // Big endian
- } else {
- return State::INVALID;
- }
+ uint32_t readle(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read16le(data, offset);
+ }
- // To allow offset to advance and to return, we need to
- // read the next character completely.
- ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
- if (ret == utf::NEED_MORE) {
- return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return State::INVALID;
- }
+ uint32_t readbe(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read16be(data, offset);
+ }
+};
- endian_ = endian;
- in_offset = tmp;
- out[out_offset++] = ret;
- if (out_offset == out_size)
- return State::GOOD;
- }
+class Utf32Decoder : public UnknownEndianDecoder {
+ public:
+ Utf32Decoder() = default;
- if (endian_ == 1) {
- do {
- uint32_t ret = utf::read32be(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- } else {
- do {
- uint32_t ret = utf::read32le(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- }
- return State::GOOD;
+ uint32_t readle(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read32le(data, offset);
}
- private:
- int8_t endian_{-1};
+ uint32_t readbe(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read32be(data, offset);
+ }
};
class AsciiDecoder : public Decoder {
public:
AsciiDecoder() = default;
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
- do {
- if (in_offset == in.size())
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- if (in[in_offset] & 0x80)
- return out_offset > out_start ? State::GOOD : State::INVALID;
- out[out_offset++] = in[in_offset++];
- } while (out_offset < out_size);
- return State::GOOD;
+ State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) override {  // Copies 7-bit bytes from |in| to |out| through utf::write8.
+ if (in_offset >= in.size())
+ return State::NEED_MORE;
+ if (in[in_offset] & 0x80)
+ return State::INVALID;  // High bit set, so not ASCII.
+ if (!utf::write8(in[in_offset], out, out_offset))
+ return State::NEED_MORE;  // NOTE(review): presumably write8 fails when |out| is full -- confirm.
+ ++in_offset;
+
+ while (true) {  // At least one byte was produced, so every stop condition below reports GOOD.
+ if (in_offset >= in.size() || in[in_offset] & 0x80)
+ return State::GOOD;
+ if (!utf::write8(in[in_offset], out, out_offset))
+ return State::GOOD;
+ ++in_offset;
+ }
}
};
diff --git a/sax/src/guessing_decoder.cc b/sax/src/guessing_decoder.cc
new file mode 100644
index 0000000..e72dab3
--- /dev/null
+++ b/sax/src/guessing_decoder.cc
@@ -0,0 +1,92 @@
+#include "guessing_decoder.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+#include <cassert>
+
+using namespace std::string_view_literals;
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Returns true if the bytes of |a| starting at |a_offset| begin with |b|.
+// |a_offset| is only read, never modified.
+bool eq(std::span<uint8_t const> a, std::size_t& a_offset, std::string_view b) {
+  if (a.size() - a_offset < b.size())
+    return false;
+  for (size_t i = 0; i < b.size(); ++i) {
+    // Compare as unsigned bytes. Plain char may be signed, in which case
+    // every byte >= 0x80 (that is, every BOM byte) would compare unequal.
+    if (a[a_offset + i] != static_cast<uint8_t>(b[i]))
+      return false;
+  }
+  return true;
+}
+
+// Decoder that picks the real decoder from the first bytes of the input:
+// a byte order mark if there is one, otherwise the zero-byte pattern of the
+// first character (compare XML 1.0 appendix F). Once a decoder has been
+// chosen every call is forwarded to it.
+class GuessingDecoder : public Decoder {
+ public:
+  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+               std::span<uint8_t> out, std::size_t& out_offset) override {
+    assert(in_offset <= in.size());
+
+    if (!decided_) {
+      if (eq(in, in_offset, "\xef\xbb\xbf"sv)) {
+        // UTF-8 BOM. Skipped like the other BOMs so that it does not end
+        // up in front of the document.
+        in_offset += 3;
+        decided_ = create_utf8_decoder();
+      } else if (eq(in, in_offset, "\x00\x00\xfe\xff"sv)) {
+        // UTF-32 BOM (U+FEFF) stored big endian.
+        in_offset += 4;
+        decided_ = create_utf32be_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe\x00\x00"sv)) {
+        // UTF-32 BOM stored little endian. Must be tested before the
+        // UTF-16LE BOM, which is a prefix of it.
+        in_offset += 4;
+        decided_ = create_utf32le_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe"sv)) {
+        // Could be the start of a UTF-32LE BOM, need more data to decide
+        // (note, an xml document encoded in UTF-16 that is less than 4 bytes
+        // is rather impossible).
+        if (in.size() - in_offset < 4)
+          return State::NEED_MORE;
+        in_offset += 2;
+        decided_ = create_utf16le_decoder();
+      } else if (eq(in, in_offset, "\xfe\xff"sv)) {
+        in_offset += 2;
+        decided_ = create_utf16be_decoder();
+      } else {
+        // No BOM. Guess from where the zero bytes of the first character
+        // (expected to be ASCII, typically '<') are located.
+        auto avail = in.size() - in_offset;
+        if (avail == 0)
+          return State::NEED_MORE;
+        if (avail >= 4 && in[in_offset] == 0 && in[in_offset + 1] == 0
+            && in[in_offset + 2] == 0 && in[in_offset + 3] != 0) {
+          // 00 00 00 xx: most significant byte first.
+          decided_ = create_utf32be_decoder();
+        } else if (avail >= 4 && in[in_offset] != 0 && in[in_offset + 1] == 0
+                   && in[in_offset + 2] == 0 && in[in_offset + 3] == 0) {
+          // xx 00 00 00: least significant byte first.
+          decided_ = create_utf32le_decoder();
+        } else if (avail >= 2 && in[in_offset] == 0 && in[in_offset + 1] != 0) {
+          // 00 xx
+          decided_ = create_utf16be_decoder();
+        } else if (avail >= 2 && in[in_offset] != 0 && in[in_offset + 1] == 0) {
+          // xx 00
+          decided_ = create_utf16le_decoder();
+        } else {
+          // Probe with a scratch offset, the real decoding happens below.
+          auto tmp = in_offset;
+          auto ret = utf::read8(in, tmp);
+          if (ret == utf::NEED_MORE)
+            return State::NEED_MORE;
+          if (ret == utf::INVALID)
+            return State::INVALID;
+          // UTF-8 should be good enough to read the XML declaration.
+          decided_ = create_utf8_decoder();
+        }
+      }
+    }
+    return decided_->decode(in, in_offset, out, out_offset);
+  }
+
+ private:
+  // Null until the encoding has been guessed.
+  std::unique_ptr<Decoder> decided_;
+};
+
+} // namespace
+
+// Factory for the encoding-guessing decoder; the concrete type stays
+// private to this file.
+std::unique_ptr<Decoder> create_guessing_decoder() {
+  auto decoder = std::make_unique<GuessingDecoder>();
+  return decoder;
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/guessing_decoder.hh b/sax/src/guessing_decoder.hh
new file mode 100644
index 0000000..0f42c3b
--- /dev/null
+++ b/sax/src/guessing_decoder.hh
@@ -0,0 +1,21 @@
+#ifndef GUESSING_DECODER_HH
+#define GUESSING_DECODER_HH
+
+#include "macros.hh"
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+// Decoder that tries to figure out which encoding is used, from a BOM
+// or, failing that, from the byte pattern of the first character.
+// Optimized for the first character being '<'.
+std::unique_ptr<Decoder> HIDDEN create_guessing_decoder();
+
+} // namespace sax
+} // namespace modxml
+
+#endif // GUESSING_DECODER_HH
diff --git a/sax/src/sax_attributes.cc b/sax/src/sax_attributes.cc
new file mode 100644
index 0000000..230c677
--- /dev/null
+++ b/sax/src/sax_attributes.cc
@@ -0,0 +1,38 @@
+#include "sax_attributes.hh"
+
+namespace modxml {
+namespace sax {
+
+Attribute::Attribute(std::string_view name, std::string_view value)  // Stores both arguments in the members of the same name.
+ : name(name), value(value) {}
+
+// Returns the value of the first attribute called |name|, or std::nullopt
+// when there is no such attribute.
+std::optional<std::string_view> Attributes::find_first(std::string_view name)
+    const {
+  auto it = begin();
+  while (it != end()) {
+    if (it->name == name)
+      return it->value;
+    ++it;
+  }
+  return std::nullopt;
+}
+
+// Returns the value of the last attribute called |name|, or std::nullopt
+// when there is no such attribute. Walks the attributes back to front.
+std::optional<std::string_view> Attributes::find_last(std::string_view name)
+    const {
+  auto remaining = size();
+  while (remaining > 0) {
+    --remaining;
+    auto const& candidate = at(remaining);
+    if (candidate.name == name)
+      return candidate.value;
+  }
+  return std::nullopt;
+}
+
+// Returns the index of the first attribute called |name| at position
+// |index| or later, or std::nullopt if none is found.
+std::optional<std::size_t> Attributes::find(std::string_view name,
+                                            std::size_t index) const {
+  while (index < size()) {
+    if (at(index).name == name)
+      return index;
+    ++index;
+  }
+  return std::nullopt;
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/sax_delegate.cc b/sax/src/sax_delegate.cc
new file mode 100644
index 0000000..2c2cfcd
--- /dev/null
+++ b/sax/src/sax_delegate.cc
@@ -0,0 +1,21 @@
+#include "sax_delegate.hh"
+
+namespace modxml {
+namespace sax {
+
+void Delegate::start_element(std::string_view, Attributes const&) {}  // Default: ignore the event.
+
+void Delegate::empty_element(std::string_view, Attributes const&) {}  // Default: ignore the event.
+
+void Delegate::end_element(std::string_view) {}  // Default: ignore the event.
+
+void Delegate::character_data(std::string_view) {}  // Default: ignore the event.
+
+void Delegate::processing_instruction(std::string_view, std::string_view) {}  // Default: ignore the event.
+
+void Delegate::comment(std::string_view) {}  // Default: ignore the event.
+
+void Delegate::error(std::string_view) {}  // Default: the error message is dropped.
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
index ea9f753..afc9d3b 100644
--- a/sax/src/sax_processor.cc
+++ b/sax/src/sax_processor.cc
@@ -1,18 +1,41 @@
#include "sax_processor.hh"
-#include "sax_decoder.hh"
+#include <iostream>
+
+#include "buffer.hh"
+#include "guessing_decoder.hh"
#include "processor.hh"
+#include "sax_attributes.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+#include "sax_delegate.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
#include "utils.hh"
#include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <format>
+#include <map>
#include <optional>
#include <utility>
+#include <vector>
+
+using namespace std::string_view_literals;
namespace modxml {
namespace sax {
namespace {
+constexpr std::size_t kDefaultBufferSize = 8192;
+constexpr std::size_t kMinBufferSize = 128;
+
+// True for the ASCII decimal digits '0' through '9'.
+inline bool is_digit(char c) {
+  return '0' <= c && c <= '9';
+}
+
// 2.2 Characters
// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
@@ -75,12 +98,185 @@ inline bool is_namechar(uint32_t c) {
(c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040);
}
-/* [5] Name ::= NameStartChar (NameChar)*
+/*
+[5] Name ::= NameStartChar (NameChar)*
[6] Names ::= Name (#x20 Name)*
[7] Nmtoken ::= (NameChar)+
[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
*/
+// Maps 'A'-'Z' to 'a'-'z'; every other character is returned unchanged.
+// Returns char (not bool) so that the result can be compared with another
+// character.
+inline char ascii_lowercase(char c) {
+  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
+}
+
+// Compares |a|, passed through ascii_lowercase() character by character,
+// with |b|. |b| is compared as-is. Strings of different length never match.
+bool eq_lowercase(std::string_view a, std::string_view b) {
+  return std::equal(a.begin(), a.end(), b.begin(), b.end(),
+                    [](char lhs, char rhs) {
+                      return !(ascii_lowercase(lhs) != rhs);
+                    });
+}
+
+// Views the bytes of |span| as characters. Nothing is copied, so the result
+// is only valid for as long as the memory behind |span| is.
+inline std::string_view make_string_view(std::span<uint8_t const> span) {
+  auto const* chars = reinterpret_cast<char const*>(span.data());
+  return {chars, span.size()};
+}
+
+// Lookup table for references: the five predefined XML entities plus
+// numeric character references ("#123", "#x7b").
+class Entities {
+ public:
+  Entities() {
+    data_.emplace("lt", "<");
+    data_.emplace("gt", ">");
+    data_.emplace("amp", "&");
+    data_.emplace("apos", "'");
+    data_.emplace("quot", "\"");
+  }
+
+  // Returns the replacement text for |entity|, which is the text between
+  // '&' and ';' of a reference. Returns std::nullopt if the entity is
+  // unknown or is a malformed character reference.
+  std::optional<std::string> get(std::string const& entity) const {
+    if (entity.empty())
+      return std::nullopt;
+    if (entity.front() == '#') {
+      if (entity.size() == 1)
+        return std::nullopt;
+      int base;
+      char const* start;
+      char const* end = entity.data() + entity.size();
+      if (entity[1] == 'x') {
+        start = entity.data() + 2;
+        base = 16;
+      } else {
+        start = entity.data() + 1;
+        base = 10;
+      }
+      uint32_t value;
+      auto [ptr, ec] = std::from_chars(start, end, value, base);
+      // The whole remainder must be digits, "#12a" is not a reference.
+      if (ec != std::errc() || ptr != end)
+        return std::nullopt;
+      uint8_t tmp[4];
+      std::size_t offset = 0;
+      // Do not hand out an empty or partial replacement if the value could
+      // not be encoded.
+      // NOTE(review): presumably write8 rejects values that are not valid
+      // code points -- confirm.
+      if (!utf::write8(value, tmp, offset))
+        return std::nullopt;
+      return std::string(reinterpret_cast<char*>(tmp), offset);
+    }
+    auto it = data_.find(entity);
+    if (it == data_.end())
+      return std::nullopt;
+    return it->second;
+  }
+
+ private:
+  // Entity name (without '&' and ';') to replacement text.
+  std::map<std::string, std::string> data_;
+};
+
+// Expands every reference ("&name;", "&#n;", "&#xn;") in |str| in place,
+// starting the search at index |last|. Returns false if a reference is
+// unterminated or unknown; |str| may then be partially expanded.
+bool deamp(Entities const& entities, std::string& str, std::size_t last = 0) {
+  while (true) {
+    auto const amp = str.find('&', last);
+    if (amp == std::string::npos)
+      break;
+    auto const semicolon = str.find(';', amp + 1);
+    if (semicolon == std::string::npos)
+      return false;
+    auto const replacement =
+        entities.get(str.substr(amp + 1, semicolon - amp - 1));
+    if (!replacement.has_value())
+      return false;
+    // Replace everything from '&' up to and including ';'.
+    str.replace(amp, semicolon - amp + 1, *replacement);
+    // Continue after the inserted text so that a replacement that itself
+    // contains '&' (from "&amp;") is not expanded a second time.
+    last = amp + replacement->size();
+  }
+  return true;
+}
+
+// Strips the surrounding quote characters from |quoted| and runs deamp()
+// on the rest. Returns std::nullopt if deamp() fails.
+std::optional<std::string> unquote(Entities const& entities,
+                                   std::string_view quoted) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  quoted.remove_prefix(1);
+  quoted.remove_suffix(1);
+  std::string unquoted(quoted);
+  if (!deamp(entities, unquoted))
+    return std::nullopt;
+  return unquoted;
+}
+
+// Like unquote() but avoids the copy when the value has no '&': then the
+// result is a view into |quoted| and |tmp| is left untouched. Otherwise the
+// value is copied to |tmp|, run through deamp() from the first '&', and the
+// result is a view of |tmp|. Returns std::nullopt if deamp() fails.
+std::optional<std::string_view> unquote_if_needed(Entities const& entities,
+                                                  std::string_view quoted,
+                                                  std::string& tmp) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  auto const inner = quoted.substr(1, quoted.size() - 2);
+  auto const first_amp = inner.find('&');
+  if (first_amp == std::string_view::npos)
+    return inner;
+  tmp.assign(inner);
+  if (!deamp(entities, tmp, first_amp))
+    return std::nullopt;
+  return tmp;
+}
+
+// Attributes implementation whose names and values are views into the raw
+// bytes of the element. Values that had references expanded are views into
+// strings owned by this object instead.
+class AttributesImpl : public Attributes {
+ public:
+  AttributesImpl() = default;
+
+  // Builds the attribute list from |offsets|. Starting at index |first| it
+  // holds groups of four numbers: name start, name length, quoted value
+  // start and quoted value length, all relative to |data|.
+  // Returns false if a value contains a reference that cannot be expanded.
+  // |data| must stay valid for as long as the attributes are used.
+  bool init(Entities const& entities,
+            std::span<const uint8_t> data,
+            std::vector<size_t> const& offsets,
+            std::size_t first) {
+    std::size_t a = first;
+    attr_.reserve((offsets.size() - first) / 4);
+    while (a + 4 <= offsets.size()) {
+      auto name = make_string_view(data.subspan(offsets[a], offsets[a + 1]));
+      std::string tmp;
+      auto value = unquote_if_needed(
+          entities,
+          make_string_view(data.subspan(offsets[a + 2], offsets[a + 3])),
+          tmp);
+      if (!value.has_value())
+        return false;
+      if (tmp.empty()) {
+        attr_.emplace_back(name, *value);
+      } else {
+        // |*value| points into |tmp|, which dies at the end of this
+        // iteration, and moving a short string copies its characters.
+        // Park the string on the heap first, where it never moves again,
+        // and only then take the view.
+        expanded_.push_back(std::make_unique<std::string>(std::move(tmp)));
+        attr_.emplace_back(name, std::string_view(*expanded_.back()));
+      }
+      a += 4;
+    }
+    return true;
+  }
+
+  iterator begin() const override {
+    return Iterator(this, 0);
+  }
+
+  iterator end() const override {
+    return Iterator(this, attr_.size());
+  }
+
+  std::size_t size() const override {
+    return attr_.size();
+  }
+
+  Attribute const& at(std::size_t index) const override {
+    return attr_[index];
+  }
+
+ private:
+  // Gives this class access to the iterator constructor.
+  class Iterator : public iterator {
+   public:
+    Iterator(Attributes const* attributes, std::size_t index)
+        : iterator(attributes, index) {}
+  };
+
+  struct AttributeImpl : public Attribute {
+    AttributeImpl(std::string_view name, std::string_view value)
+        : Attribute(name, value) {}
+  };
+
+  // Owners of expanded attribute values. Each string is allocated
+  // separately so that growing the vector never moves the characters.
+  std::vector<std::unique_ptr<std::string>> expanded_;
+  std::vector<AttributeImpl> attr_;
+};
+
class ProcessorImpl : public Processor {
public:
ProcessorImpl(std::shared_ptr<Delegate> delegate,
@@ -91,15 +287,898 @@ class ProcessorImpl : public Processor {
: delegate_(std::move(delegate)),
decoder_factory_(std::move(decoder_factory)),
decoder_(std::move(decoder)),
- default_buffer_size_(default_buffer_size),
- max_buffer_size_(max_buffer_size) {}
+ forced_decoder_(decoder_),
+ buffer_(make_buffer(default_buffer_size, max_buffer_size)) {
+ if (!decoder_)
+ decoder_ = create_guessing_decoder();
+
+ expect_document();
+ }
+
+ std::size_t process(std::span<uint8_t const> data,
+ std::size_t offset) override {  // Runs queued commands until one needs more input or fails; returns |consumed|.
+ cmds_.emplace_back(Command::FILL_BUFFER, Count::ZERO_OR_ONE);  // Pushed last, so it is the first command taken below.
+
+ std::size_t consumed = 0;
+
+ while (true) {
+ if (cmds_.empty()) {
+ if (!buffer_->empty()) {
+ std::cerr << make_string_view(buffer_->rspan()) << std::endl;  // NOTE(review): looks like leftover debug output -- confirm.
+ delegate_->error("Extra data at end");
+ }
+ return consumed;
+ }
+
+ auto current = cmds_.back();  // Copy; handlers taking CommandItem& update this copy's offset.
+ auto const old_size = cmds_.size();
+ cmds_.pop_back();
+ Process ret;
+ switch (current.command) {
+ case Command::FILL_BUFFER:
+ ret = fill_buffer(data, offset, consumed);
+ break;
+ case Command::MISC:
+ ret = process_misc(current);
+ break;
+ case Command::SPACE:
+ ret = process_space(current);
+ break;
+ case Command::ELEMENT:
+ ret = process_element(current);
+ break;
+ case Command::COMMENT:
+ ret = process_comment(current);
+ break;
+ case Command::PROCESSING_INSTRUCTION:
+ ret = process_processing_instruction(current);
+ break;
+ case Command::XMLDECL:
+ ret = process_xmldecl(current);
+ break;
+ case Command::ATTRIBUTE:
+ ret = process_attribute(current);
+ break;
+ case Command::NAME:
+ ret = process_name(current);
+ break;
+ case Command::ATTRIBUTE_VALUE:
+ ret = process_attribute_value(current);
+ break;
+ case Command::EQUAL:
+ ret = process_equal(current);
+ break;
+ case Command::START_OR_EMPTY_TAG:
+ ret = process_start_or_empty_tag(current);
+ break;
+ case Command::END_TAG:
+ ret = process_end_tag(current);
+ break;
+ }
+
+ switch (ret) {
+ case Process::NEED_MORE:
+ case Process::ERROR:
+ cmds_.push_back(current);  // Put the command back so it is retried on the next call.
+ assert(cmds_.size() == old_size);
+ return consumed;
+ case Process::CONTINUE:
+ break;
+ }
+ }
+ }
+
+ uint64_t line() const override { return line_; }  // Starts at 1; handle_ws() adds one per '\n'.
+
+ uint64_t column() const override { return column_; }  // handle_ws() resets this to 0 on '\n'.
private:
+ enum class Process {  // Result of running one command.
+ NEED_MORE,  // process() re-queues the command and returns.
+ ERROR,  // process() re-queues the command and returns.
+ CONTINUE,  // process() goes on with the next command.
+ };
+
+ enum class Match {  // Result of match()/find() against the buffer.
+ FULL_MATCH,
+ PARTIAL_MATCH,  // The data ended before the comparison could be decided.
+ NO_MATCH,
+ };
+
+ enum class Command {  // What a CommandItem asks process() to do.
+ FILL_BUFFER,
+
+ ATTRIBUTE,
+ ATTRIBUTE_VALUE,
+ COMMENT,
+ ELEMENT,
+ END_TAG,
+ EQUAL,
+ MISC,
+ NAME,
+ PROCESSING_INSTRUCTION,
+ SPACE,
+ START_OR_EMPTY_TAG,
+ XMLDECL,
+ };
+
+ enum class Count {  // How often a command may match; see add_if_more() and no_match().
+ ONE,
+ ONE_OR_MANY,
+ ZERO_OR_ONE,
+ ZERO_OR_MANY,
+ };
+
+ struct CommandItem {  // One entry of the cmds_ stack.
+ Command const command;
+ Count const count;
+ std::size_t offset;  // Progress of the command; its meaning depends on the command.
+
+ CommandItem(Command command, Count count, std::size_t offset = 0)
+ : command(command), count(count), offset(offset) {}
+ };
+
+ struct StackItem {  // Created by add_to_stack(), removed by pop_stack().
+ std::vector<std::size_t> offsets;  // (start, length) pairs pushed by process_name() and process_attribute_value().
+ };
+
+ Process fill_buffer(std::span<uint8_t const> data,
+ std::size_t offset,
+ std::size_t& consumed) {  // Decodes |data| from |offset| into the buffer's write span.
+ if (offset >= data.size())
+ return Process::NEED_MORE;
+
+ std::size_t tmp = offset;
+ auto wspan = buffer_->wspan(4);
+ switch (decoder_->decode(data, tmp, wspan, consumed)) {  // NOTE(review): |consumed| is used as the output offset and the input progress in |tmp| is dropped -- confirm intended.
+ case Decoder::State::GOOD:
+ break;
+ case Decoder::State::NEED_MORE:
+ return Process::NEED_MORE;
+ case Decoder::State::INVALID:
+ delegate_->error("Invalid data");
+ return Process::ERROR;
+ }
+ buffer_->commit(consumed);
+ return Process::CONTINUE;
+ }
+
+ void expect_document() {  // Queued in reverse order because commands are taken from the back of cmds_.
+ // document := prolog element Misc*
+ expect_misc(Count::ZERO_OR_MANY);
+ expect_element(Count::ONE);
+ expect_prolog();
+ }
+
+ void expect_misc(Count count) {  // Queues a MISC command.
+ cmds_.emplace_back(Command::MISC, count);
+ }
+
+ void expect_element(Count count) {  // Queues START_OR_EMPTY_TAG, which handles both alternatives below.
+ // element ::= EmptyElemTag | STag content ETag
+ cmds_.emplace_back(Command::START_OR_EMPTY_TAG, count);
+ }
+
+ void expect_end_tag(Count count) {  // Queues an END_TAG command.
+ cmds_.emplace_back(Command::END_TAG, count);
+ }
+
+ void expect_prolog() {  // Queued in reverse order because commands are taken from the back of cmds_.
+ // prolog := XMLDecl? Misc* (doctypedecl Misc*)?
+ expect_misc(Count::ZERO_OR_MANY);
+ expect_doctypedecl(Count::ZERO_OR_ONE);
+ expect_misc(Count::ZERO_OR_MANY);
+ expect_xmldecl(Count::ZERO_OR_ONE);
+ }
+
+ void expect_xmldecl(Count count) {  // Queues an XMLDECL command.
+ cmds_.emplace_back(Command::XMLDECL, count);
+ }
+
+ void expect_doctypedecl(Count) {  // Placeholder: queues nothing yet.
+ // TODO: doctypedecl is not parsed yet.
+ }
+
+ void expect_comment(Count count, std::size_t start_offset = 0) {  // Queues a COMMENT command starting at |start_offset|.
+ // A comment is never repeated by itself; repetition is handled by MISC.
+ assert(count == Count::ONE);
+ cmds_.emplace_back(Command::COMMENT, count, start_offset);
+ }
+
+ void expect_content(Count) {  // Placeholder: queues nothing yet.
+ // TODO: element content is not parsed yet.
+ }
+
+ void expect_pi(Count count, std::size_t start_offset = 0) {  // Queues a PROCESSING_INSTRUCTION command starting at |start_offset|.
+ // A PI is never repeated by itself; repetition is handled by MISC.
+ assert(count == Count::ONE);
+ cmds_.emplace_back(Command::PROCESSING_INSTRUCTION, count, start_offset);
+ }
+
+ void expect_space(Count count) {  // Queues a SPACE command.
+ // There is no way to have S S as S is continuous, so we should never
+ // ask for more than one or zero.
+ assert(count == Count::ZERO_OR_ONE || count == Count::ONE);
+ cmds_.emplace_back(Command::SPACE, count);
+ }
+
+  // Queues the commands for "S Attribute" according to |count|. For the
+  // required forms the parts of one attribute are queued directly, in
+  // reverse order because commands are taken from the back of cmds_.
+  void expect_attribute(Count count) {
+    switch (count) {
+      case Count::ONE_OR_MANY:
+        // The repetition goes in first (so it runs last), then the one
+        // mandatory attribute is queued by the next case.
+        cmds_.emplace_back(Command::ATTRIBUTE, Count::ZERO_OR_MANY);
+        [[fallthrough]];
+      case Count::ONE:
+        // Attribute ::= Name Eq AttValue
+        expect_attribute_value(Count::ONE);
+        expect_equal(Count::ONE);
+        expect_name(Count::ONE);
+        expect_space(Count::ONE);
+        break;
+      case Count::ZERO_OR_ONE:
+      case Count::ZERO_OR_MANY:
+        cmds_.emplace_back(Command::ATTRIBUTE, count);
+        break;
+    }
+  }
+
+ void expect_attribute_value(Count count) {  // Queues an ATTRIBUTE_VALUE command.
+ cmds_.emplace_back(Command::ATTRIBUTE_VALUE, count);
+ }
+
+ void expect_equal(Count count) {  // Queues optional space, EQUAL, optional space.
+ // Eq ::= S? '=' S?
+ expect_space(Count::ZERO_OR_ONE);
+ cmds_.emplace_back(Command::EQUAL, count);
+ expect_space(Count::ZERO_OR_ONE);
+ }
+
+ void expect_name(Count count) {  // Queues a NAME command.
+ cmds_.emplace_back(Command::NAME, count);
+ }
+
+ Process process_misc(CommandItem const& item) {  // Peeks at the buffer and queues the matching Misc alternative.
+ // Misc := Comment | PI | S
+ assert(item.offset == 0);
+
+ switch (match("<!--")) {  // match() only peeks, nothing is consumed here.
+ case Match::FULL_MATCH:
+ add_if_more(item);
+ expect_comment(Count::ONE, 3);  // NOTE(review): "<!--" is four bytes but 3 is passed as start offset -- confirm.
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ break;
+ }
+
+ switch (match("<?")) {
+ case Match::FULL_MATCH:
+ add_if_more(item);
+ expect_pi(Count::ONE, 2);
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ break;
+ }
+
+ switch (match_s()) {
+ case Match::FULL_MATCH:
+ add_if_more(item);
+ expect_space(Count::ONE);
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ break;
+ }
+
+ return no_match(item);
+ }
+
+ Process process_attribute(CommandItem& item) {  // Consumes S and queues one attribute if a name follows.
+ // This actually parses (S Attribute)* when followed by S?
+ // for Attribute parsing see expect_attribute()
+ // So we need to figure out if the S means start of attribute
+ // or just an S. We do this by checking if the first non-S is
+ // a namestart or something else. We consume the S.
+ uint32_t last_char;
+ auto ret = consume_space(item.offset, last_char);  // item.offset counts the whitespace characters seen.
+ if (ret != Process::CONTINUE)
+ return ret;
+
+ // No S, cannot be followed by an attribute then.
+ if (item.offset == 0)
+ return no_match(item);
+
+ // First character after S isn't a valid first character of a name,
+ // cannot be followed by an attribute then.
+ if (!is_namestartchar(last_char))
+ return no_match(item);
+
+ expect_attribute_value(Count::ONE);  // NOTE(review): add_if_more() is not called here, so ZERO_OR_MANY does not repeat -- confirm.
+ expect_equal(Count::ONE);
+ expect_name(Count::ONE);
+ return Process::CONTINUE;
+ }
+
+  // Eq ::= S? '=' S?
+  // Only the '=' is handled here, the optional whitespace on both sides is
+  // queued by expect_equal().
+  Process process_equal(CommandItem const& item) {
+    auto const result = match_consume("=");
+    if (result == Match::PARTIAL_MATCH)
+      return Process::NEED_MORE;
+    if (result == Match::NO_MATCH)
+      return no_match(item);
+    add_if_more(item);
+    return Process::CONTINUE;
+  }
+
+ Process process_name(CommandItem& item) {  // Scans a Name and records its (start, length) on the stack.
+ // Name ::= NameStartChar (NameChar)*
+ auto data = buffer_->rspan(item.offset + 4);
+ while (true) {
+ std::size_t tmp = item.offset;  // Scratch offset; item.offset only advances for accepted characters.
+ auto c = utf::read8(data, tmp);
+ if (c == utf::NEED_MORE)
+ return Process::NEED_MORE;
+ if (c == utf::INVALID || !valid_char(c))
+ return invalid_char(data, tmp);
+ if (item.offset == 0) {
+ if (!is_namestartchar(c))
+ return no_match(item);
+ } else {
+ if (!is_namechar(c))
+ break;
+ }
+ item.offset = tmp;
+ }
+
+ assert(!stack_.empty());
+ auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());  // Relies on add_to_stack() having wrapped the buffer.
+ stack_.back().offsets.push_back(read_view->consumed());
+ stack_.back().offsets.push_back(item.offset);
+ buffer_->consume(item.offset);
+ return Process::CONTINUE;
+ }
+
+ Process process_attribute_value(CommandItem& item) {  // Scans a quoted value and records its (start, length), quotes included.
+ // AttValue ::= '"' ([^<&"] | Reference)* '"'
+ // | "'" ([^<&'] | Reference)* "'"
+
+ uint32_t end_char;
+ auto data = buffer_->rspan(item.offset + 4);
+
+ if (item.offset == 0) {
+ std::size_t tmp = item.offset;
+ auto c = utf::read8(data, tmp);
+ if (c == utf::NEED_MORE)
+ return Process::NEED_MORE;
+ if (c == utf::INVALID || !valid_char(c))
+ return invalid_char(data, tmp);
+ if (c != '"' && c != '\'')
+ return no_match(item);
+ item.offset = tmp;
+ end_char = c;
+ } else {
+ assert(!data.empty());
+ end_char = data[0]; // ok as both " and ' are ASCII
+ }
+
+ while (true) {
+ auto c = utf::read8(data, item.offset);
+ if (c == utf::NEED_MORE)
+ return Process::NEED_MORE;
+ if (c == utf::INVALID || !valid_char(c))
+ return invalid_char(data, item.offset);
+ if (c == end_char)
+ break;
+ // TODO: Should we validate reference already here or do we let
+ // unquote take care of that? As Reference can't contain end_char
+ // only checking for end_char is safe here.
+ }
+
+ assert(!stack_.empty());
+ auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());  // Relies on add_to_stack() having wrapped the buffer.
+ stack_.back().offsets.push_back(read_view->consumed());
+ stack_.back().offsets.push_back(item.offset);
+ buffer_->consume(item.offset);
+ return Process::CONTINUE;
+ }
+
+ Process process_comment(CommandItem& item) {  // Looks for "-->" and reports the text before it to the delegate.
+ if (item.offset == 0) {
+ switch (match_consume("<!--")) {
+ case Match::FULL_MATCH:
+ item.offset += 3;
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ return no_match(item);
+ }
+ }
+
+ auto match = find("-->", item.offset);  // find() moves item.offset to where the (partial) match starts.
+ switch (match) {
+ case Match::FULL_MATCH: {
+ auto data = buffer_->rspan(item.offset);
+ assert(data.size() >= item.offset);
+ delegate_->comment(
+ make_string_view(data.subspan(3, item.offset - 3)));  // NOTE(review): offsets assume "<!--" is still in the buffer, yet the branch above consumes it -- confirm.
+ buffer_->consume(item.offset + 3);
+ return Process::CONTINUE;
+ }
+ case Match::NO_MATCH:
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ }
+ }
+
+ Process process_processing_instruction(CommandItem& item) {  // Only recognises "<?" so far; always ends in an error.
+ if (item.offset == 0) {
+ switch (match_consume("<?")) {
+ case Match::FULL_MATCH:
+ item.offset += 2;
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ return no_match(item);
+ }
+ }
+
+ // TODO: parse the target and the content.
+ delegate_->error("PI not supported");
+ return Process::ERROR;
+ }
+
+ void add_to_stack(CommandItem const& item, std::size_t offset) {  // Re-queues |item| with |offset| and starts recording offsets for it.
+ cmds_.emplace_back(item.command, item.count, offset);
+ stack_.emplace_back();
+ buffer_ = make_read_view_buffer(std::move(buffer_));  // Wraps the current buffer; pop_stack() unwraps it again.
+ buffer_->consume(offset);
+ }
+
+ std::size_t pop_stack(std::vector<std::size_t>& attr) {  // Undoes add_to_stack(): hands out the offsets, returns the view's consumed().
+ assert(!stack_.empty());
+ std::swap(attr, stack_.back().offsets);
+
+ auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+ auto consumed = read_view->consumed();
+
+ buffer_ = read_view->release();  // Back to the wrapped buffer; |read_view| must not be used after this.
+ stack_.pop_back();
+
+ return consumed;
+ }
+
+ Process process_xmldecl(CommandItem const& item) {  // Two passes: offset 0 queues the parts, offset 5 validates them.
+ // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+ if (item.offset == 0) {
+ switch (match("<?xml")) {
+ case Match::FULL_MATCH:
+ add_to_stack(item, /* offset */ 5);  // Re-queues this command so the code below runs after the attributes.
+ expect_space(Count::ZERO_OR_ONE);
+ // Parsing as generic "Attribute" here and doing validation later.
+ expect_attribute(Count::ONE_OR_MANY);
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ return no_match(item);
+ }
+ }
+
+ assert(item.offset == 5);
+
+ // Remember that this is still reading from the read view buffer.
+ switch (match_consume("?>")) {
+ case Match::FULL_MATCH:
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ delegate_->error(std::format("Expected end of {}",
+ command_name(item.command)));
+ return Process::ERROR;
+ }
+
+ std::vector<std::size_t> attr;  // Groups of four: name start, name length, value start, value length.
+ auto const consumed = pop_stack(attr);
+
+ // Now we're back to the real buffer
+ auto data = buffer_->rspan(consumed);
+ std::size_t a = 0;
+
+ if (a + 4 <= attr.size() &&
+ make_string_view(data.subspan(attr[a + 0],
+ attr[a + 1])) == "version") {
+ auto version = make_string_view(data.subspan(attr[a + 2] + 1,
+ attr[a + 3] - 2));  // +1 / -2 drops the surrounding quotes.
+ if (!valid_version(version)) {
+ delegate_->error(std::format("Unsupported xmldecl version, {}",
+ version));
+ return Process::ERROR;
+ }
+ a += 4;
+ } else {
+ // No version
+ delegate_->error("Invalid xmldecl, must have a version attribute first.");
+ return Process::ERROR;
+ }
+
+ if (a + 4 <= attr.size() &&
+ make_string_view(data.subspan(attr[a + 0],
+ attr[a + 1])) == "encoding") {
+ auto encoding = make_string_view(data.subspan(attr[a + 2] + 1,
+ attr[a + 3] - 2));
+ if (forced_decoder_) {
+ // encoding value is ignored
+ // TODO: Should we check that it is valid anyway?
+ } else {
+ auto decoder = pick_decoder_for_encoding(encoding, nullptr);
+ if (!decoder && decoder_factory_)
+ decoder = decoder_factory_->create(encoding);
+ if (!decoder) {
+ delegate_->error(std::format("Unknown encoding {}", encoding));
+ return Process::ERROR;
+ }
+ std::swap(decoder_, decoder);
+ // TODO: Re-decode the rest of the buffer?
+ }
+ a += 4;
+ }
+
+ if (a + 4 <= attr.size() &&
+ make_string_view(data.subspan(attr[a + 0],
+ attr[a + 1])) == "standalone") {
+ auto sd = make_string_view(data.subspan(attr[a + 2] + 1,
+ attr[a + 3] - 2));
+ if (sd == "yes") {
+ // TODO: Handle standalone == yes
+ } else if (sd == "no") {
+ // TODO: Handle standalone == no
+ } else {
+ delegate_->error(std::format(
+ "Invalid xmldecl, standalone attribute has unsupported value, {}",
+ sd));
+ return Process::ERROR;
+ }
+ a += 4;
+ }
+
+ if (a < attr.size()) {  // Anything left is unknown or out of order.
+ delegate_->error(
+ std::format("Invalid xmldecl, unknown attribute, {}",
+ make_string_view(data.subspan(attr[a + 0],
+ attr[a + 1]))));
+ return Process::ERROR;
+ }
+
+ buffer_->consume(consumed);
+ return Process::CONTINUE;
+ }
+
+ Process process_start_or_empty_tag(CommandItem const& item) {  // Two passes: offset 0 queues the parts, offset 1 reports the tag.
+ // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
+ // STag ::= '<' Name (S Attribute)* S? '>'
+ if (item.offset == 0) {
+ switch (match("<")) {
+ case Match::FULL_MATCH:
+ add_to_stack(item, /* offset */ 1);  // Re-queues this command so the code below runs after name and attributes.
+ expect_space(Count::ZERO_OR_ONE);
+ expect_attribute(Count::ZERO_OR_MANY);
+ expect_name(Count::ONE);
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ return no_match(item);
+ }
+ }
+
+ assert(item.offset == 1);
+
+ bool empty_tag;
+
+ // Remember that this is still reading from the read view buffer.
+ switch (match_consume("/>")) {
+ case Match::FULL_MATCH:
+ empty_tag = true;
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ switch (match_consume(">")) {
+ case Match::FULL_MATCH:
+ empty_tag = false;
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ delegate_->error(std::format("Expected end of {}",
+ command_name(item.command)));
+ return Process::ERROR;
+ }
+ break;
+ }
+
+ std::vector<std::size_t> attr;  // Name (start, length) first, then four numbers per attribute.
+ auto const consumed = pop_stack(attr);
+
+ // Now we're back to the real buffer
+ auto data = buffer_->rspan(consumed);
+
+ assert(attr.size() >= 2);
+ auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+ AttributesImpl attributes;
+ if (!attributes.init(entities_, data, std::move(attr), 2)) {
+ delegate_->error("Invalid references in attribute values");
+ return Process::ERROR;
+ }
+
+ add_if_more(item);
+
+ if (empty_tag) {
+ delegate_->empty_element(name, attributes);
+ } else {
+ delegate_->start_element(name, attributes);
+ expect_end_tag(Count::ONE);
+ expect_content(Count::ONE);
+ }
+
+ buffer_->consume(consumed);  // Only now are the bytes behind |name| and |attributes| consumed.
+ return Process::CONTINUE;
+ }
+
+  // ETag ::= '</' Name S? '>'
+  // Runs in two passes. With offset 0 it recognises "</", re-queues itself
+  // with offset 2 and queues the name and the optional space. With offset 2
+  // it expects '>' and reports the end tag to the delegate.
+  Process process_end_tag(CommandItem const& item) {
+    if (item.offset == 0) {
+      switch (match("</")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 2);
+          expect_space(Count::ZERO_OR_ONE);
+          expect_name(Count::ONE);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    // Must agree with the offset handed to add_to_stack() above.
+    assert(item.offset == 2);
+
+    // Remember that this is still reading from the read view buffer.
+    switch (match_consume(">")) {
+      case Match::FULL_MATCH:
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        delegate_->error(std::format("Expected end of {}",
+                                     command_name(item.command)));
+        return Process::ERROR;
+    }
+
+    // Holds the (start, length) pair pushed by process_name().
+    std::vector<std::size_t> attr;
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+
+    assert(attr.size() == 2);
+    auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+    add_if_more(item);
+
+    delegate_->end_element(name);
+
+    // The bytes behind |name| are only consumed after the delegate has
+    // seen them.
+    buffer_->consume(consumed);
+    return Process::CONTINUE;
+  }
+
+  // Accepts "1." followed by one or more ASCII digits, nothing else.
+  static bool valid_version(std::string_view version) {
+    constexpr std::string_view kPrefix = "1.";
+    // There has to be at least one character after the prefix.
+    if (version.size() <= kPrefix.size() || !version.starts_with(kPrefix))
+      return false;
+    return std::all_of(version.begin() + kPrefix.size(), version.end(),
+                       [](char c) { return is_digit(c); });
+  }
+
+ Process process_element(CommandItem& item) {  // Placeholder: always reports an error.
+ // TODO: not implemented.
+ delegate_->error("Element is not yet supported");
+ return Process::ERROR;
+ }
+
+ Process consume_space(std::size_t& count, uint32_t& last_char) {  // Consumes leading whitespace, adding the number of characters to |count|.
+ auto data = buffer_->rspan(4);
+ std::size_t consumed = 0;
+ while (true) {
+ std::size_t offset = consumed;
+ auto c = utf::read8(data, offset);
+ if (c == utf::NEED_MORE) {
+ buffer_->consume(consumed);  // Keep what was found; |count| carries the total over to the retry.
+ return Process::NEED_MORE;
+ }
+ if (c == utf::INVALID || !valid_char(c))
+ return invalid_char(data, offset);
+ if (!is_ws(c)) {
+ last_char = c;  // The first non-whitespace character; it is not consumed.
+ buffer_->consume(consumed);
+ return Process::CONTINUE;
+ }
+ ++count;
+ handle_ws(c);
+ consumed = offset;
+ }
+ }
+
+ Process process_space(CommandItem& item) {  // Consumes a run of whitespace; no whitespace at all is a no_match().
+ // S ::= (#x20 | #x9 | #xD | #xA)+
+ // item.offset is only used to count spaces. We consume each space as it
+ // is found so no offset in buffer.
+ uint32_t unused;
+ auto ret = consume_space(item.offset, unused);
+ if (ret != Process::CONTINUE)
+ return ret;
+
+ if (item.offset == 0)
+ return no_match(item);
+
+ add_if_more(item);
+ return Process::CONTINUE;
+ }
+
+  // Called after |item| matched once. If its count allows repetition the
+  // command is queued again as ZERO_OR_MANY, otherwise nothing happens.
+  void add_if_more(CommandItem const& item) {
+    bool const repeatable = item.count == Count::ONE_OR_MANY ||
+                            item.count == Count::ZERO_OR_MANY;
+    if (repeatable)
+      cmds_.emplace_back(item.command, Count::ZERO_OR_MANY);
+  }
+
+  // Searches the buffer for |str|, starting at |offset|.
+  //  FULL_MATCH:    |offset| is the index where |str| starts.
+  //  PARTIAL_MATCH: the data ends with a proper prefix of |str| and
+  //                 |offset| is the index where that prefix starts.
+  //  NO_MATCH:      |offset| is the end of the data (or unchanged if it
+  //                 was already past it).
+  Match find(std::string_view str, std::size_t& offset) {
+    auto data = buffer_->rspan(offset + str.size());
+    for (; offset < data.size(); ++offset) {
+      // Try |str| at every position, so that a candidate that fails late
+      // (like "--->" when looking for "-->") does not hide a match that
+      // starts inside it.
+      auto const avail = std::min(str.size(), data.size() - offset);
+      std::size_t i = 0;
+      while (i < avail && str[i] == data[offset + i])
+        ++i;
+      if (i == str.size())
+        return Match::FULL_MATCH;
+      if (i == avail)  // Ran out of data while everything so far matched.
+        return Match::PARTIAL_MATCH;
+    }
+    return Match::NO_MATCH;
+  }
+
+  // Compares |str| with the buffer content at |offset|, without consuming.
+  // PARTIAL_MATCH means the data ended before a difference was found.
+  Match match(std::string_view str, std::size_t offset = 0) {
+    auto data = buffer_->rspan(offset + str.size());
+    if (offset >= data.size())
+      return Match::PARTIAL_MATCH;
+    auto const remaining = data.size() - offset;
+    auto const comparable = std::min(str.size(), remaining);
+    std::size_t i = 0;
+    while (i < comparable) {
+      if (str[i] != data[offset + i])
+        return Match::NO_MATCH;
+      ++i;
+    }
+    return comparable == str.size() ? Match::FULL_MATCH
+                                    : Match::PARTIAL_MATCH;
+  }
+
+  // Like match() at offset 0, but a full match is also consumed from the
+  // buffer.
+  Match match_consume(std::string_view str) {
+    auto const result = match(str);
+    if (result != Match::FULL_MATCH)
+      return result;
+    buffer_->consume(str.size());
+    return result;
+  }
+
+  // Checks, without consuming, whether the buffer starts with a whitespace
+  // character. PARTIAL_MATCH is only reported for an empty buffer.
+  Match match_s() {
+    auto data = buffer_->rspan(4);
+    std::size_t offset = 0;
+    auto const c = utf::read8(data, offset);
+    if (c == utf::NEED_MORE) {
+      if (data.empty())
+        return Match::PARTIAL_MATCH;
+      return Match::NO_MATCH;
+    }
+    bool const is_space = c != utf::INVALID && valid_char(c) && is_ws(c);
+    return is_space ? Match::FULL_MATCH : Match::NO_MATCH;
+  }
+
+  // Called when |item| did not match. That is an error for the counts that
+  // require at least one occurrence and fine for the optional ones.
+  Process no_match(CommandItem const& item) {
+    bool const required =
+        item.count == Count::ONE || item.count == Count::ONE_OR_MANY;
+    if (!required)
+      return Process::CONTINUE;
+    delegate_->error(std::format("Expected {}",
+                                 command_name(item.command)));
+    return Process::ERROR;
+  }
+
+  // Updates the position for one whitespace character: a newline starts
+  // the next line at column 0, anything else advances the column.
+  void handle_ws(uint32_t c) {
+    if (c != '\n') {
+      ++column_;
+      return;
+    }
+    ++line_;
+    column_ = 0;
+  }
+
+ Process invalid_char(std::span<uint8_t const> data, std::size_t offset) {  // Reports the byte at |offset| as invalid; always returns ERROR.
+ delegate_->error(std::format("Invalid char {:02x}", data[offset]));  // NOTE(review): assumes |offset| is still inside |data| after the failed read -- confirm.
+ return Process::ERROR;
+ }
+
+ static std::string_view command_name(Command command) {  // Human readable name, used in "Expected ..." error messages.
+ switch (command) {
+ case Command::MISC:
+ return "misc"sv;
+ case Command::FILL_BUFFER:
+ return "more data"sv;
+ case Command::ELEMENT:
+ return "element"sv;
+ case Command::SPACE:
+ return "whitespace"sv;
+ case Command::COMMENT:
+ return "comment"sv;
+ case Command::PROCESSING_INSTRUCTION:
+ return "processing instruction"sv;
+ case Command::XMLDECL:
+ return "xml declaration"sv;
+ case Command::ATTRIBUTE:
+ return "attribute"sv;
+ case Command::ATTRIBUTE_VALUE:
+ return "attribute value"sv;
+ case Command::NAME:
+ return "name"sv;
+ case Command::EQUAL:
+ return "equal sign (=)"sv;
+ case Command::START_OR_EMPTY_TAG:
+ return "element"sv;  // Same text as Command::ELEMENT.
+ case Command::END_TAG:
+ return "end tag"sv;
+ }
+ assert(false);  // Only reached for a value outside the enum.
+ return {};
+ }
+
std::shared_ptr<Delegate> delegate_;
std::shared_ptr<DecoderFactory> decoder_factory_;
std::unique_ptr<Decoder> decoder_;
- std::size_t default_buffer_size_;
- std::size_t max_buffer_size_;
+ bool const forced_decoder_;
+ std::unique_ptr<Buffer> buffer_;
+ Entities entities_;
+ std::vector<CommandItem> cmds_;
+ std::vector<StackItem> stack_;
+ uint64_t line_{1};
+ uint64_t column_{0};
};
} // namespace
@@ -117,9 +1196,9 @@ std::unique_ptr<Processor> create_processor(
decoder_factory.get());
}
- std::size_t default_buffer_size = 8192;
+ std::size_t default_buffer_size = kDefaultBufferSize;
if (opt_default_buffer_size.has_value())
- default_buffer_size = std::max(static_cast<std::size_t>(128),
+ default_buffer_size = std::max(kMinBufferSize,
opt_default_buffer_size.value());
// This value is documented in public headers. Do NOT change.
std::size_t max_buffer_size = 10 * 1024 * 1024;
@@ -136,7 +1215,8 @@ std::unique_ptr<Processor> create_processor(
max_buffer_size);
}
-std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) {
+std::unique_ptr<Processor>
+Processor::create(std::shared_ptr<Delegate> delegate) {
return create_processor(std::move(delegate), nullptr,
std::nullopt, std::nullopt, std::nullopt);
}
diff --git a/sax/src/utils.cc b/sax/src/utils.cc
index f0366d5..e3a53b1 100644
--- a/sax/src/utils.cc
+++ b/sax/src/utils.cc
@@ -9,7 +9,7 @@ namespace sax {
namespace {
-std::string cleanup_encoding(std::string const& str) {
+std::string cleanup_encoding(std::string_view str) {
std::string ret;
ret.reserve(str.size());
for (auto c : str) {
@@ -29,29 +29,29 @@ std::string cleanup_encoding(std::string const& str) {
// Names inspired by:
// https://www.iana.org/assignments/character-sets/character-sets.xhtml
std::unique_ptr<Decoder> pick_decoder_for_encoding(
- std::string const& encoding, DecoderFactory* factory) {
+ std::string_view encoding, DecoderFactory* factory) {
auto clean_enc = cleanup_encoding(encoding);
- if (clean_enc == "utf-8" || clean_enc == "utf8") {
+ if (clean_enc == "utf-8" || clean_enc == "utf8")
return create_utf8_decoder();
- }
- if (clean_enc == "utf-16" || clean_enc == "utf16") {
+
+ if (clean_enc == "utf-16" || clean_enc == "utf16")
return create_utf16_decoder();
- }
- if (clean_enc == "utf-16be" || clean_enc == "utf16be") {
+
+ if (clean_enc == "utf-16be" || clean_enc == "utf16be")
return create_utf16be_decoder();
- }
- if (clean_enc == "utf-16le" || clean_enc == "utf16le") {
+
+ if (clean_enc == "utf-16le" || clean_enc == "utf16le")
return create_utf16le_decoder();
- }
- if (clean_enc == "utf-32" || clean_enc == "utf32") {
+
+ if (clean_enc == "utf-32" || clean_enc == "utf32")
return create_utf32_decoder();
- }
- if (clean_enc == "utf-32be" || clean_enc == "utf32be") {
+
+ if (clean_enc == "utf-32be" || clean_enc == "utf32be")
return create_utf32be_decoder();
- }
- if (clean_enc == "utf-32le" || clean_enc == "utf32le") {
+
+ if (clean_enc == "utf-32le" || clean_enc == "utf32le")
return create_utf32le_decoder();
- }
+
if (clean_enc == "ascii" || clean_enc == "us-ascii" ||
clean_enc == "usascii" || clean_enc == "iso-ir-6" ||
clean_enc == "ansi-x3-4-1968" || clean_enc == "ansi-x3-4-1986" ||
@@ -59,9 +59,10 @@ std::unique_ptr<Decoder> pick_decoder_for_encoding(
clean_enc == "us" || clean_enc == "ibm367" || clean_enc == "cp367") {
return create_ascii_decoder();
}
- if (factory) {
+
+ if (factory)
return factory->create(encoding);
- }
+
return nullptr;
}
diff --git a/sax/src/utils.hh b/sax/src/utils.hh
index 206d003..074f0c0 100644
--- a/sax/src/utils.hh
+++ b/sax/src/utils.hh
@@ -4,7 +4,7 @@
#include "macros.hh"
#include <memory>
-#include <string>
+#include <string_view>
namespace modxml {
namespace sax {
@@ -13,7 +13,7 @@ class Decoder;
class DecoderFactory;
std::unique_ptr<Decoder> HIDDEN pick_decoder_for_encoding(
- std::string const& encoding,
+ std::string_view encoding,
DecoderFactory* factory);
} // namespace sax
diff --git a/sax/tst/test_buffer.cc b/sax/tst/test_buffer.cc
new file mode 100644
index 0000000..13bc6d4
--- /dev/null
+++ b/sax/tst/test_buffer.cc
@@ -0,0 +1,272 @@
+#include "buffer.hh"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+enum class BufferType { // Selects which make_buffer() configuration a parameterized test runs against.
+ FIXED, // BufferTest::make_buffer() calls modxml::sax::make_buffer(size, size)
+ DYNAMIC, // BufferTest::make_buffer() calls modxml::sax::make_buffer(size / 2, size)
+};
+
+// Fixture that runs every TEST_P once per BufferType.
+class BufferTest : public testing::TestWithParam<BufferType> {
+ protected:
+  // Create a buffer that can hold size bytes, configured according to the
+  // current test parameter.
+  std::unique_ptr<modxml::sax::Buffer> make_buffer(std::size_t size) {
+    auto const type = GetParam();
+    if (type == BufferType::FIXED)
+      return modxml::sax::make_buffer(size, size);
+    if (type == BufferType::DYNAMIC)
+      return modxml::sax::make_buffer(size / 2, size);
+    return nullptr;
+  }
+};
+
+std::array<uint8_t, 10> AAAAAAAAAA{ // ten 'A' bytes, payload shared by the tests below
+ 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'};
+std::array<uint8_t, 5> BBBBB{ // five 'B' bytes, payload shared by the tests below
+ 'B', 'B', 'B', 'B', 'B'};
+
+} // namespace
+
+TEST_P(BufferTest, sanity) { // Basic FIFO behaviour: fill, drain, partial writes and partial reads.
+ auto buf = make_buffer(10);
+ EXPECT_TRUE(buf->empty());
+ EXPECT_FALSE(buf->full());
+
+ EXPECT_TRUE(buf->write_all(AAAAAAAAAA)); // 10 bytes fill the buffer completely
+ EXPECT_TRUE(buf->full());
+ EXPECT_FALSE(buf->empty());
+
+ EXPECT_FALSE(buf->write_all(AAAAAAAAAA)); // no room left, so write_all must refuse
+
+ std::array<uint8_t, 10> tmp10;
+ EXPECT_TRUE(buf->read_all(tmp10));
+ EXPECT_THAT(tmp10, testing::ContainerEq(AAAAAAAAAA));
+ EXPECT_TRUE(buf->empty());
+ EXPECT_FALSE(buf->full());
+
+ EXPECT_TRUE(buf->write_all(BBBBB)); // buffered: BBBBB
+ EXPECT_FALSE(buf->full());
+ EXPECT_FALSE(buf->empty());
+
+ EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); // only 5 of the 10 bytes fit; buffered: BBBBBAAAAA
+ EXPECT_TRUE(buf->full());
+ EXPECT_FALSE(buf->empty());
+
+ std::array<uint8_t, 3> tmp3;
+ EXPECT_TRUE(buf->read_all(tmp3)); // buffered afterwards: BBAAAAA
+ EXPECT_THAT(tmp3, testing::ElementsAre('B', 'B', 'B'));
+
+ EXPECT_EQ(3u, buf->write(BBBBB)); // 3 bytes of room; buffered: BBAAAAABBB
+
+ EXPECT_TRUE(buf->read_all(tmp3)); // buffered afterwards: AAAABBB
+ EXPECT_THAT(tmp3, testing::ElementsAre('B', 'B', 'A'));
+
+ std::array<uint8_t, 5> tmp5;
+ EXPECT_TRUE(buf->read_all(tmp5)); // buffered afterwards: BB
+ EXPECT_THAT(tmp5, testing::ElementsAre('A', 'A', 'A', 'A', 'B'));
+
+ EXPECT_FALSE(buf->read_all(tmp3)); // only 2 bytes left, read_all of 3 must fail
+ tmp3[2] = 'X'; // sentinel: shows that the partial read below leaves the third byte alone
+ EXPECT_EQ(2u, buf->read(tmp3));
+ EXPECT_THAT(tmp3, testing::ElementsAre('B', 'B', 'X'));
+}
+
+TEST_P(BufferTest, noop) { // Zero-length operations must succeed and leave the buffer empty.
+ auto buf = make_buffer(10);
+ EXPECT_TRUE(buf->empty());
+
+ std::array<uint8_t, 0> empty;
+ EXPECT_EQ(0u, buf->write(empty));
+ EXPECT_EQ(0u, buf->read(empty));
+
+ EXPECT_TRUE(buf->write_all(empty)); // writing nothing counts as success
+ EXPECT_TRUE(buf->read_all(empty)); // so does reading nothing, even from an empty buffer
+
+ buf->commit(0);
+ buf->consume(0);
+
+ EXPECT_TRUE(buf->empty());
+}
+
+TEST_P(BufferTest, one_byte_filler) { // Streams the values 0..20 through a 10 byte buffer one byte at a time.
+ auto buf = make_buffer(10);
+
+ std::array<uint8_t, 1> tmp1;
+ uint8_t out = 0; // next value expected to come out of the buffer
+ for (uint8_t in = 0; in <= 20; ++in) {
+ tmp1[0] = in;
+ EXPECT_TRUE(buf->write_all(tmp1));
+ if (in >= 9) { // 10 bytes are buffered now: take one out so the next write fits
+ EXPECT_TRUE(buf->read_all(tmp1));
+ EXPECT_EQ(tmp1[0], out);
+ ++out;
+ }
+ }
+ for (; out <= 20; ++out) { // drain what is left and check the order
+ EXPECT_TRUE(buf->read_all(tmp1));
+ EXPECT_EQ(tmp1[0], out);
+ }
+ EXPECT_TRUE(buf->empty());
+}
+
+TEST_P(BufferTest, read_wrap) { // Reads data written after space was freed at the front (presumably wrapping internally; the storage layout is not visible here).
+ auto buf = make_buffer(10);
+
+ EXPECT_TRUE(buf->write_all(BBBBB));
+ EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); // only 5 bytes fit; buffered: BBBBBAAAAA
+
+ std::array<uint8_t, 5> tmp5;
+ EXPECT_TRUE(buf->read_all(tmp5)); // buffered afterwards: AAAAA
+ EXPECT_THAT(tmp5, testing::ContainerEq(BBBBB));
+
+ EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); // the 5 freed bytes are reused; buffered: AAAAAAAAAA
+
+ std::array<uint8_t, 10> tmp10;
+ EXPECT_TRUE(buf->read_all(tmp10)); // one read must return both parts in order
+ EXPECT_THAT(tmp10, testing::ContainerEq(AAAAAAAAAA));
+}
+
+TEST_P(BufferTest, skip_wrap) { // Same sequence as read_wrap, but discarding data with consume() instead of reading it.
+ auto buf = make_buffer(10);
+
+ EXPECT_TRUE(buf->write_all(BBBBB));
+ EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); // only 5 bytes fit; buffer is full
+
+ buf->consume(5); // drop the five 'B' bytes
+ EXPECT_FALSE(buf->empty());
+
+ EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); // the freed space is reused; 10 bytes buffered again
+
+ buf->consume(10); // drop everything in one call
+ EXPECT_TRUE(buf->empty());
+}
+
+TEST_P(BufferTest, write_wrap) { // A single write_all must succeed when the free space is split by earlier reads.
+ auto buf = make_buffer(12);
+
+ EXPECT_TRUE(buf->write_all(BBBBB));
+
+ std::array<uint8_t, 3> tmp3;
+ EXPECT_TRUE(buf->read_all(tmp3)); // buffered afterwards: BB
+ EXPECT_THAT(tmp3, testing::ElementsAre('B', 'B', 'B'));
+
+ EXPECT_TRUE(buf->write_all(AAAAAAAAAA)); // 2 + 10 = 12 bytes, exactly the requested size
+
+ std::array<uint8_t, 12> tmp12;
+ EXPECT_EQ(12u, buf->read(tmp12));
+ EXPECT_THAT(tmp12, testing::ElementsAre(
+ 'B', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'));
+}
+
+TEST_P(BufferTest, read_wrap2) { // Two follow-up writes refill the buffer before a single read drains it.
+ auto buf = make_buffer(12);
+
+ EXPECT_TRUE(buf->write_all(AAAAAAAAAA));
+
+ std::array<uint8_t, 7> tmp7;
+ EXPECT_TRUE(buf->read_all(tmp7)); // buffered afterwards: AAA
+ EXPECT_THAT(tmp7, testing::ElementsAre('A', 'A', 'A', 'A', 'A', 'A', 'A'));
+
+ EXPECT_EQ(5u, buf->write(BBBBB)); // 3 + 5 = 8 bytes buffered
+ EXPECT_EQ(4u, buf->write(BBBBB)); // only 4 more fit: 12 bytes, buffer is full
+
+ std::array<uint8_t, 12> tmp12;
+ EXPECT_TRUE(buf->read_all(tmp12));
+ EXPECT_THAT(tmp12, testing::ElementsAre(
+ 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'));
+}
+
+// A buffer created with make_buffer(10, 1000) must accept 60 bytes and hand
+// them back in order (10 is presumably the initial size and 1000 the upper
+// limit; confirm against buffer.hh).
+TEST(Buffer, dynamic_resize) {
+  auto buf = modxml::sax::make_buffer(10, 1000);
+
+  std::array<uint8_t, 30> tmp30;
+  for (uint8_t i = 0; i < 30; ++i)
+    tmp30[i] = i;
+
+  EXPECT_TRUE(buf->write_all(tmp30));
+  EXPECT_TRUE(buf->write_all(tmp30));
+
+  std::array<uint8_t, 60> tmp60;
+  EXPECT_TRUE(buf->read_all(tmp60));
+  // Use std::size_t for the index and widen the value: streaming a uint8_t
+  // into the failure message would print a raw control character instead of
+  // a number.
+  for (std::size_t i = 0; i < tmp60.size(); ++i)
+    EXPECT_EQ(i % 30, static_cast<std::size_t>(tmp60[i])) << i;
+}
+
+TEST(Buffer, dynamic_overalloc) { // A wspan() request that cannot be satisfied must yield an empty span.
+ // This test can fail, but in most configurations trying to allocate
+ // std::numeric_limits<std::size_t>::max() will fail.
+ auto buf = modxml::sax::make_buffer(10, std::numeric_limits<std::size_t>::max());
+ EXPECT_FALSE(buf->wspan(10000).empty()); // a 10000 byte request is expected to succeed
+ EXPECT_TRUE(buf->wspan(std::numeric_limits<std::size_t>::max()).empty()); // an impossible request reports failure as an empty span
+}
+
+TEST_P(BufferTest, modify) { // Bytes changed through mspan() must be visible to later reads.
+ auto buf = make_buffer(10);
+
+ EXPECT_TRUE(buf->write_all(AAAAAAAAAA));
+
+ auto span = buf->mspan(5);
+ EXPECT_EQ(10u, span.size()); // all 10 buffered bytes are expected to be exposed although 5 were asked for
+ auto len = std::min(static_cast<std::size_t>(5), span.size()); // keeps the loop in bounds should the expectation above fail
+ for (uint8_t i = 0; i < len; ++i)
+ span[i] = 'C';
+
+ std::array<uint8_t, 10> tmp10;
+ EXPECT_TRUE(buf->read_all(tmp10));
+ EXPECT_THAT(tmp10, testing::ElementsAre(
+ 'C', 'C', 'C', 'C', 'C', 'A', 'A', 'A', 'A', 'A'));
+}
+
+TEST_P(BufferTest, uncommit) { // uncommit(n) takes back the n most recently written bytes and returns how many it removed.
+ auto buf = make_buffer(10);
+
+ EXPECT_TRUE(buf->write_all(BBBBB));
+
+ EXPECT_EQ(0u, buf->uncommit(0)); // removing nothing is a no-op
+
+ EXPECT_EQ(5u, buf->write(AAAAAAAAAA)); // only 5 bytes fit; buffered: BBBBBAAAAA
+
+ std::array<uint8_t, 2> tmp2;
+ EXPECT_TRUE(buf->read_all(tmp2)); // buffered afterwards: BBBAAAAA
+ EXPECT_THAT(tmp2, testing::ElementsAre('B', 'B'));
+
+ EXPECT_EQ(3u, buf->uncommit(3)); // drops the last three 'A' bytes; buffered: BBBAA
+ std::array<uint8_t, 5> tmp5;
+ EXPECT_TRUE(buf->read_all(tmp5));
+ EXPECT_THAT(tmp5, testing::ElementsAre('B', 'B', 'B', 'A', 'A'));
+
+ EXPECT_EQ(0u, buf->uncommit(2)); // nothing is buffered any more, so nothing can be removed
+}
+
+TEST_P(BufferTest, uncommit_wrap) { // uncommit() across data from two separate writes (presumably crossing the wrap point; storage layout not visible here).
+ auto buf = make_buffer(10);
+
+ EXPECT_TRUE(buf->write_all(AAAAAAAAAA));
+ std::array<uint8_t, 5> tmp5;
+ EXPECT_TRUE(buf->read_all(tmp5)); // buffered afterwards: AAAAA
+
+ EXPECT_TRUE(buf->write_all(BBBBB)); // buffered: AAAAABBBBB
+
+ EXPECT_EQ(8u, buf->uncommit(8)); // removes all five 'B' bytes and three 'A' bytes
+ std::array<uint8_t, 2> tmp2;
+ EXPECT_TRUE(buf->read_all(tmp2));
+ EXPECT_THAT(tmp2, testing::ElementsAre('A', 'A'));
+}
+
+INSTANTIATE_TEST_SUITE_P( // Runs every TEST_P(BufferTest, ...) once per BufferType value.
+ BufferTests,
+ BufferTest,
+ testing::Values(BufferType::FIXED, BufferType::DYNAMIC),
+ [](auto& info) { // name generator: turns the parameter into a readable test name suffix
+ switch (info.param) {
+ case BufferType::FIXED:
+ return "fixed";
+ case BufferType::DYNAMIC:
+ return "dynamic";
+ }
+ return ""; // not reached for the two values instantiated above
+ }
+);
diff --git a/sax/tst/test_decoder.cc b/sax/tst/test_decoder.cc
new file mode 100644
index 0000000..86f230b
--- /dev/null
+++ b/sax/tst/test_decoder.cc
@@ -0,0 +1,242 @@
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+#include "sax_processor.hh"
+#include "sax_delegate.hh"
+
+#include <memory>
+#include <gtest/gtest.h>
+
+namespace {
+
+// Delegate that records whether an empty <root/> element was seen and turns
+// every reported error into a test failure.
+class TestDelegate : public modxml::sax::Delegate {
+ public:
+  ~TestDelegate() override = default;
+
+  // Only a single empty element named "root" is expected.
+  void empty_element(std::string_view name,
+                     modxml::sax::Attributes const&) override {
+    EXPECT_EQ(name, "root");
+    if (name != "root")
+      return;
+    // A second "root" element is a failure.
+    EXPECT_FALSE(have_root_);
+    have_root_ = true;
+  }
+
+  // Remember the error so process_all() can stop, then fail the test.
+  void error(std::string_view message) override {
+    have_error_ = true;
+    FAIL() << message;
+  }
+
+  bool have_error() const { return have_error_; }
+
+  bool have_root() const { return have_root_; }
+
+ private:
+  bool have_error_{false};
+  bool have_root_{false};
+};
+
+// Feed data to processor until all of it has been consumed.
+// Returns false as soon as a call consumes nothing or the delegate has seen
+// an error, true once the whole span has been consumed.
+bool process_all(modxml::sax::Processor& processor,
+                 TestDelegate& delegate,
+                 std::span<uint8_t const> data) {
+  for (std::size_t pos = 0; pos < data.size();) {
+    auto used = processor.process(data, pos);
+    if (delegate.have_error() || used == 0)
+      return false;
+    pos += used;
+  }
+  return true;
+}
+
+} // namespace
+
+// A UTF-8 document without byte order mark must produce one <root/>.
+TEST(sax, decoder_utf8) {
+  auto const delegate = std::make_shared<TestDelegate>();
+  auto const processor = modxml::sax::Processor::create(delegate);
+  std::string const xml = R"(<?xml version="1.0" encoding="utf-8"?><root />)";
+  std::cerr << xml << std::endl;
+  auto const* const first = reinterpret_cast<uint8_t const*>(xml.data());
+  std::span<uint8_t const> const bytes(first, xml.size());
+  EXPECT_TRUE(process_all(*processor, *delegate, bytes));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// Same as decoder_utf8, but the document starts with the UTF-8 encoded byte
+// order mark (EF BB BF).
+TEST(sax, decoder_utf8_bom) {
+  auto const delegate = std::make_shared<TestDelegate>();
+  auto const processor = modxml::sax::Processor::create(delegate);
+  std::string const xml =
+      "\xef\xbb\xbf" R"(<?xml version="1.0" encoding="utf-8"?><root />)";
+  std::cerr << xml << std::endl;
+  auto const* const first = reinterpret_cast<uint8_t const*>(xml.data());
+  std::span<uint8_t const> const bytes(first, xml.size());
+  EXPECT_TRUE(process_all(*processor, *delegate, bytes));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-16 in the byte order of the machine running the test, no byte order
+// mark: the in-memory representation of the u16string is fed as is.
+TEST(sax, decoder_utf16) {
+  auto const delegate = std::make_shared<TestDelegate>();
+  auto const processor = modxml::sax::Processor::create(delegate);
+  std::u16string const xml =
+      uR"(<?xml version="1.0" encoding="utf-16"?><root />)";
+  auto const* const first = reinterpret_cast<uint8_t const*>(xml.data());
+  std::span<uint8_t const> const bytes(first, xml.size() * sizeof(char16_t));
+  EXPECT_TRUE(process_all(*processor, *delegate, bytes));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-16 serialised big endian (high byte first), no byte order mark.
+TEST(sax, decoder_utf16be) {
+  auto const delegate = std::make_shared<TestDelegate>();
+  auto const processor = modxml::sax::Processor::create(delegate);
+  std::u16string const text =
+      uR"(<?xml version="1.0" encoding="utf-16"?><root />)";
+  std::vector<uint8_t> bytes;
+  for (char16_t unit : text) {
+    bytes.push_back(unit >> 8);
+    bytes.push_back(unit & 0xff);
+  }
+  EXPECT_TRUE(process_all(
+      *processor, *delegate,
+      std::span<uint8_t const>(bytes.data(), bytes.size())));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-16 serialised little endian (low byte first), no byte order mark.
+TEST(sax, decoder_utf16le) {
+  auto const delegate = std::make_shared<TestDelegate>();
+  auto const processor = modxml::sax::Processor::create(delegate);
+  std::u16string const text =
+      uR"(<?xml version="1.0" encoding="utf-16"?><root />)";
+  std::vector<uint8_t> bytes;
+  for (char16_t unit : text) {
+    bytes.push_back(unit & 0xff);
+    bytes.push_back(unit >> 8);
+  }
+  EXPECT_TRUE(process_all(
+      *processor, *delegate,
+      std::span<uint8_t const>(bytes.data(), bytes.size())));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-16 big endian preceded by a byte order mark.
+TEST(sax, decoder_utf16be_bom) {
+  auto delegate = std::make_shared<TestDelegate>();
+  auto processor = modxml::sax::Processor::create(delegate);
+  // The byte order mark is U+FEFF, which serialises big endian as FE FF.
+  // U+FFFE (used here before) is a noncharacter; written big endian it
+  // yields FF FE, i.e. the little endian mark in front of big endian data.
+  std::u16string str =
+      u"\ufeff" uR"(<?xml version="1.0" encoding="utf-16"?><root />)";
+  std::vector<uint8_t> input;
+  for (char16_t c : str) {
+    input.push_back(c >> 8);
+    input.push_back(c & 0xff);
+  }
+  EXPECT_TRUE(process_all(
+      *processor.get(),
+      *delegate.get(),
+      std::span<uint8_t const>(input.data(), input.size())));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-16 little endian preceded by a byte order mark.
+TEST(sax, decoder_utf16le_bom) {
+  auto delegate = std::make_shared<TestDelegate>();
+  auto processor = modxml::sax::Processor::create(delegate);
+  // The byte order mark is U+FEFF, which serialises little endian as FF FE.
+  // U+FFFE (used here before) is a noncharacter; written little endian it
+  // yields FE FF, i.e. the big endian mark in front of little endian data.
+  std::u16string str =
+      u"\ufeff" uR"(<?xml version="1.0" encoding="utf-16"?><root />)";
+  std::vector<uint8_t> input;
+  for (char16_t c : str) {
+    input.push_back(c & 0xff);
+    input.push_back(c >> 8);
+  }
+  EXPECT_TRUE(process_all(
+      *processor.get(),
+      *delegate.get(),
+      std::span<uint8_t const>(input.data(), input.size())));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-32 in the byte order of the machine running the test, no byte order
+// mark: the in-memory representation of the u32string is fed as is.
+TEST(sax, decoder_utf32) {
+  auto const delegate = std::make_shared<TestDelegate>();
+  auto const processor = modxml::sax::Processor::create(delegate);
+  std::u32string const xml =
+      UR"(<?xml version="1.0" encoding="utf-32"?><root />)";
+  auto const* const first = reinterpret_cast<uint8_t const*>(xml.data());
+  std::span<uint8_t const> const bytes(first, xml.size() * sizeof(char32_t));
+  EXPECT_TRUE(process_all(*processor, *delegate, bytes));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-32 serialised big endian (most significant byte first), no byte order
+// mark.
+TEST(sax, decoder_utf32be) {
+  auto const delegate = std::make_shared<TestDelegate>();
+  auto const processor = modxml::sax::Processor::create(delegate);
+  std::u32string const text =
+      UR"(<?xml version="1.0" encoding="utf-32"?><root />)";
+  std::vector<uint8_t> bytes;
+  for (char32_t unit : text) {
+    bytes.push_back(unit >> 24);
+    bytes.push_back((unit >> 16) & 0xff);
+    bytes.push_back((unit >> 8) & 0xff);
+    bytes.push_back(unit & 0xff);
+  }
+  EXPECT_TRUE(process_all(
+      *processor, *delegate,
+      std::span<uint8_t const>(bytes.data(), bytes.size())));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-32 serialised little endian (least significant byte first), no byte
+// order mark.
+TEST(sax, decoder_utf32le) {
+  auto const delegate = std::make_shared<TestDelegate>();
+  auto const processor = modxml::sax::Processor::create(delegate);
+  std::u32string const text =
+      UR"(<?xml version="1.0" encoding="utf-32"?><root />)";
+  std::vector<uint8_t> bytes;
+  for (char32_t unit : text) {
+    bytes.push_back(unit & 0xff);
+    bytes.push_back((unit >> 8) & 0xff);
+    bytes.push_back((unit >> 16) & 0xff);
+    bytes.push_back(unit >> 24);
+  }
+  EXPECT_TRUE(process_all(
+      *processor, *delegate,
+      std::span<uint8_t const>(bytes.data(), bytes.size())));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-32 big endian preceded by a byte order mark.
+TEST(sax, decoder_utf32be_bom) {
+  auto delegate = std::make_shared<TestDelegate>();
+  auto processor = modxml::sax::Processor::create(delegate);
+  // The byte order mark is U+FEFF (00 00 FE FF when written big endian).
+  // U+FFFE, used here before, is a noncharacter and not a byte order mark.
+  std::u32string str =
+      U"\ufeff" UR"(<?xml version="1.0" encoding="utf-32"?><root />)";
+  std::vector<uint8_t> input;
+  for (char32_t c : str) {
+    input.push_back(c >> 24);
+    input.push_back((c >> 16) & 0xff);
+    input.push_back((c >> 8) & 0xff);
+    input.push_back(c & 0xff);
+  }
+  EXPECT_TRUE(process_all(
+      *processor.get(),
+      *delegate.get(),
+      std::span<uint8_t const>(input.data(), input.size())));
+  EXPECT_TRUE(delegate->have_root());
+}
+
+// UTF-32 little endian preceded by a byte order mark.
+TEST(sax, decoder_utf32le_bom) {
+  auto delegate = std::make_shared<TestDelegate>();
+  auto processor = modxml::sax::Processor::create(delegate);
+  // The byte order mark is U+FEFF (FF FE 00 00 when written little endian).
+  // U+FFFE, used here before, is a noncharacter and not a byte order mark.
+  // Both literals carry the U prefix, matching decoder_utf32be_bom.
+  std::u32string str =
+      U"\ufeff" UR"(<?xml version="1.0" encoding="utf-32"?><root />)";
+  std::vector<uint8_t> input;
+  for (char32_t c : str) {
+    input.push_back(c & 0xff);
+    input.push_back((c >> 8) & 0xff);
+    input.push_back((c >> 16) & 0xff);
+    input.push_back(c >> 24);
+  }
+  EXPECT_TRUE(process_all(
+      *processor.get(),
+      *delegate.get(),
+      std::span<uint8_t const>(input.data(), input.size())));
+  EXPECT_TRUE(delegate->have_root());
+}
diff --git a/utf/inc/utf16.hh b/utf/inc/utf16.hh
index 344b1a2..b9229bc 100644
--- a/utf/inc/utf16.hh
+++ b/utf/inc/utf16.hh
@@ -4,27 +4,29 @@
#include "macros.hh"
#include <cstdint>
-#include <string_view>
+#include <span>
namespace utf {
-/* Read one unicode codepoint from UTF-16 BigEndian encoded data if possible.
+/**
+ * Read one unicode codepoint from UTF-16 BigEndian encoded data if possible.
* If successfull offset is incremented to point to next codepoint.
* Will fail:
* - not enough data is left in data given offset, returns NEED_MORE.
* - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs,
* returns INVALID.
*/
-uint32_t HIDDEN read16be(std::string_view data, std::size_t& offset);
+uint32_t HIDDEN read16be(std::span<uint8_t const> data, std::size_t& offset);
-/* Read one unicode codepoint from UTF-16 LittleEndian encoded data if possible.
+/**
+ * Read one unicode codepoint from UTF-16 LittleEndian encoded data if possible.
* If successfull offset is incremented to point to next codepoint.
* Will fail:
* - not enough data is left in data given offset, returns NEED_MORE.
* - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs,
* returns INVALID.
*/
-uint32_t HIDDEN read16le(std::string_view data, std::size_t& offset);
+uint32_t HIDDEN read16le(std::span<uint8_t const> data, std::size_t& offset);
} // namespace utf
diff --git a/utf/inc/utf32.hh b/utf/inc/utf32.hh
index 2d3088e..4ee5eac 100644
--- a/utf/inc/utf32.hh
+++ b/utf/inc/utf32.hh
@@ -4,25 +4,27 @@
#include "macros.hh"
#include <cstdint>
-#include <string_view>
+#include <span>
namespace utf {
-/* Read one unicode codepoint from UTF-32 BigEndian encoded data if possible.
+/**
+ * Read one unicode codepoint from UTF-32 BigEndian encoded data if possible.
* If successfull offset is incremented to point to next codepoint.
* Will fail:
* - not enough data is left in data given offset, returns NEED_MORE.
* - data is not valid UTF-32, ie. outside valid ranges, returns INVALID.
*/
-uint32_t HIDDEN read32be(std::string_view data, std::size_t& offset);
+uint32_t HIDDEN read32be(std::span<uint8_t const> data, std::size_t& offset);
-/* Read one unicode codepoint from UTF-32 LittleEndian encoded data if possible.
+/**
+ * Read one unicode codepoint from UTF-32 LittleEndian encoded data if possible.
* If successfull offset is incremented to point to next codepoint.
* Will fail:
* - not enough data is left in data given offset, returns NEED_MORE.
* - data is not valid UTF-32, ie. outside valid ranges, returns INVALID.
*/
-uint32_t HIDDEN read32le(std::string_view data, std::size_t& offset);
+uint32_t HIDDEN read32le(std::span<uint8_t const> data, std::size_t& offset);
} // namespace utf
diff --git a/utf/inc/utf8.hh b/utf/inc/utf8.hh
index a3ea84a..7735ecd 100644
--- a/utf/inc/utf8.hh
+++ b/utf/inc/utf8.hh
@@ -4,18 +4,29 @@
#include "macros.hh"
#include <cstdint>
-#include <string_view>
+#include <span>
namespace utf {
-/* Read one unicode codepoint from UTF-8 encoded data if possible.
- * If successfull offset is incremented to point to next codepoint.
+/**
+ * Read one unicode codepoint from UTF-8 encoded data if possible.
+ * If successful, offset is incremented to point to next codepoint.
* Will fail:
* - not enough data is left in data given offset, returns NEED_MORE.
* - data is not valid UTF-8, this includes overlong encodings and
* invalid unicode code points, returns INVALID.
*/
-uint32_t HIDDEN read8(std::string_view data, std::size_t& offset);
+uint32_t HIDDEN read8(std::span<uint8_t const> data, std::size_t& offset);
+
+/**
+ * Write one unicode codepoint as UTF-8 into data, starting at offset,
+ * if possible.
+ * If successful, offset is incremented to the end of the written data
+ * and true is returned.
+ * If not successful, e.g. because the encoded codepoint does not fit in
+ * data after offset, offset is not incremented, data is not modified and
+ * false is returned.
+ */
+bool HIDDEN write8(uint32_t codepoint, std::span<uint8_t> data,
+ std::size_t& offset);
} // namespace utf
diff --git a/utf/meson.build b/utf/meson.build
index 64db6ff..051ddd1 100644
--- a/utf/meson.build
+++ b/utf/meson.build
@@ -23,16 +23,16 @@ test('utf8',
executable(
'test_utf8',
sources: ['tst/test_utf8.cc'],
- dependencies: [utf_dep, gtest_dep]))
+ dependencies: [utf_dep, gmock_dep, gtest_dep]))
test('utf16',
executable(
'test_utf16',
sources: ['tst/test_utf16.cc'],
- dependencies: [utf_dep, gtest_dep]))
+ dependencies: [utf_dep, gmock_dep, gtest_dep]))
test('utf32',
executable(
'test_utf32',
sources: ['tst/test_utf32.cc'],
- dependencies: [utf_dep, gtest_dep]))
+ dependencies: [utf_dep, gmock_dep, gtest_dep]))
diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc
index 43595bf..623c1be 100644
--- a/utf/src/utf16.cc
+++ b/utf/src/utf16.cc
@@ -16,7 +16,7 @@ inline bool is_low_surrogate(uint16_t c) {
} // namespace
-uint32_t read16be(std::string_view data, std::size_t& offset) {
+uint32_t read16be(std::span<uint8_t const> data, std::size_t& offset) {
if (offset > data.size() || data.size() - offset < 2)
return NEED_MORE;
uint16_t c = static_cast<uint16_t>(data[offset]) << 8
@@ -40,7 +40,7 @@ uint32_t read16be(std::string_view data, std::size_t& offset) {
return c;
}
-uint32_t read16le(std::string_view data, std::size_t& offset) {
+uint32_t read16le(std::span<uint8_t const> data, std::size_t& offset) {
if (offset > data.size() || data.size() - offset < 2)
return NEED_MORE;
uint16_t c = static_cast<uint16_t>(data[offset + 1]) << 8
diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc
index cfa29b6..e33b0b4 100644
--- a/utf/src/utf32.cc
+++ b/utf/src/utf32.cc
@@ -12,7 +12,7 @@ inline bool valid_codepoint(uint32_t c) {
} // namespace
-uint32_t read32be(std::string_view data, std::size_t& offset) {
+uint32_t read32be(std::span<uint8_t const> data, std::size_t& offset) {
if (offset > data.size() || data.size() - offset < 4)
return NEED_MORE;
uint32_t c = static_cast<uint32_t>(data[offset]) << 24
@@ -26,7 +26,7 @@ uint32_t read32be(std::string_view data, std::size_t& offset) {
return INVALID;
}
-uint32_t read32le(std::string_view data, std::size_t& offset) {
+uint32_t read32le(std::span<uint8_t const> data, std::size_t& offset) {
if (offset > data.size() || data.size() - offset < 4)
return NEED_MORE;
uint32_t c = static_cast<uint32_t>(data[offset + 3]) << 24
diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc
index 54b0296..0e444ae 100644
--- a/utf/src/utf8.cc
+++ b/utf/src/utf8.cc
@@ -12,12 +12,12 @@ inline bool valid_codepoint(uint32_t c) {
} // namespace
-uint32_t read8(std::string_view data, std::size_t& offset) {
+uint32_t read8(std::span<uint8_t const> data, std::size_t& offset) {
if (offset >= data.size())
return NEED_MORE;
uint32_t ret;
uint8_t size;
- switch (static_cast<uint8_t>(data[offset]) >> 4) {
+ switch (data[offset] >> 4) {
case 15:
if (data[offset] & 0x08)
return INVALID;
@@ -65,4 +65,35 @@ uint32_t read8(std::string_view data, std::size_t& offset) {
return ret;
}
+/**
+ * Encode codepoint as UTF-8 into data, starting at offset.
+ * Returns false, leaving data and offset untouched, if codepoint is not a
+ * Unicode scalar value (a surrogate, or above U+10FFFF) or if the encoded
+ * bytes do not fit in data after offset. Otherwise the bytes are written,
+ * offset is advanced past them and true is returned.
+ */
+bool write8(uint32_t codepoint, std::span<uint8_t> data, std::size_t& offset) {
+  // Neither surrogates nor values above U+10FFFF have a valid UTF-8 form;
+  // without this check the four byte branch below would emit invalid lead
+  // bytes (or silently truncated ones) for large values.
+  if (codepoint > 0x10ffff || (codepoint >= 0xd800 && codepoint <= 0xdfff))
+      UNLIKELY {
+    return false;
+  }
+  if (offset >= data.size()) UNLIKELY {
+    return false;
+  }
+  // offset < data.size() from here on, so the subtraction cannot wrap.
+  std::size_t const room = data.size() - offset;
+  if (codepoint < 0x80) {
+    data[offset++] = static_cast<uint8_t>(codepoint);
+  } else if (codepoint < 0x800) {
+    if (room < 2) UNLIKELY {
+      return false;
+    }
+    data[offset++] = static_cast<uint8_t>(0xc0 | (codepoint >> 6));
+    data[offset++] = static_cast<uint8_t>(0x80 | (codepoint & 0x3f));
+  } else if (codepoint < 0x10000) {
+    if (room < 3) UNLIKELY {
+      return false;
+    }
+    data[offset++] = static_cast<uint8_t>(0xe0 | (codepoint >> 12));
+    data[offset++] = static_cast<uint8_t>(0x80 | ((codepoint >> 6) & 0x3f));
+    data[offset++] = static_cast<uint8_t>(0x80 | (codepoint & 0x3f));
+  } else {
+    if (room < 4) UNLIKELY {
+      return false;
+    }
+    data[offset++] = static_cast<uint8_t>(0xf0 | (codepoint >> 18));
+    data[offset++] = static_cast<uint8_t>(0x80 | ((codepoint >> 12) & 0x3f));
+    data[offset++] = static_cast<uint8_t>(0x80 | ((codepoint >> 6) & 0x3f));
+    data[offset++] = static_cast<uint8_t>(0x80 | (codepoint & 0x3f));
+  }
+  return true;
+}
+
} // namespace utf
diff --git a/utf/tst/test_utf16.cc b/utf/tst/test_utf16.cc
index c17982e..3b3c03c 100644
--- a/utf/tst/test_utf16.cc
+++ b/utf/tst/test_utf16.cc
@@ -2,156 +2,137 @@
#include "utf_error.hh"
+#include <array>
#include <gtest/gtest.h>
TEST(utf16be, sanity) {
- std::string_view str("\x00\x24", 2);
size_t offset = 0;
- auto ret = utf::read16be(str, offset);
+ auto ret = utf::read16be(std::array<uint8_t, 2>({0x00, 0x24}), offset);
EXPECT_EQ('$', ret);
EXPECT_EQ(2, offset);
- str = "\x20\xAC";
offset = 0;
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(std::array<uint8_t, 2>({0x20, 0xAC}), offset);
EXPECT_EQ(0x20AC, ret);
EXPECT_EQ(2, offset);
- str = "\xD8\x01\xDC\x37";
offset = 0;
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(std::array<uint8_t, 4>({0xD8, 0x01, 0xDC, 0x37}), offset);
EXPECT_EQ(0x10437, ret);
EXPECT_EQ(4, offset);
- str = "\xD8\x52\xDF\x62";
offset = 0;
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(std::array<uint8_t, 4>({0xD8, 0x52, 0xDF, 0x62}), offset);
EXPECT_EQ(0x24B62, ret);
EXPECT_EQ(4, offset);
}
TEST(utf16le, sanity) {
- std::string_view str("\x24\x00", 2);
size_t offset = 0;
- auto ret = utf::read16le(str, offset);
+ auto ret = utf::read16le(std::array<uint8_t, 2>({0x24, 0x00}), offset);
EXPECT_EQ('$', ret);
EXPECT_EQ(2, offset);
- str = "\xAC\x20";
offset = 0;
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(std::array<uint8_t, 2>({0xAC, 0x20}), offset);
EXPECT_EQ(0x20AC, ret);
EXPECT_EQ(2, offset);
- str = "\x01\xD8\x37\xDC";
offset = 0;
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(std::array<uint8_t, 4>({0x01, 0xD8, 0x37, 0xDC}), offset);
EXPECT_EQ(0x10437, ret);
EXPECT_EQ(4, offset);
- str = "\x52\xD8\x62\xDF";
offset = 0;
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(std::array<uint8_t, 4>({0x52, 0xD8, 0x62, 0xDF}), offset);
EXPECT_EQ(0x24B62, ret);
EXPECT_EQ(4, offset);
}
TEST(utf16be, bom) {
- std::string_view str("\xFE\xFF\x20\xAC");
+ std::array<uint8_t, 4> data({0xFE, 0xFF, 0x20, 0xAC});
size_t offset = 0;
- auto ret = utf::read16be(str, offset);
+ auto ret = utf::read16be(data, offset);
EXPECT_EQ(0xFEFF, ret);
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(data, offset);
EXPECT_EQ(0x20AC, ret);
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(data, offset);
EXPECT_EQ(utf::NEED_MORE, ret);
- EXPECT_EQ(str.size(), offset);
+ EXPECT_EQ(data.size(), offset);
}
TEST(utf16le, bom) {
- std::string_view str("\xFF\xFE\xAC\x20");
+ std::array<uint8_t, 4> data({0xFF, 0xFE, 0xAC, 0x20});
size_t offset = 0;
- auto ret = utf::read16le(str, offset);
+ auto ret = utf::read16le(data, offset);
EXPECT_EQ(0xFEFF, ret);
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(data, offset);
EXPECT_EQ(0x20AC, ret);
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(data, offset);
EXPECT_EQ(utf::NEED_MORE, ret);
- EXPECT_EQ(str.size(), offset);
+ EXPECT_EQ(data.size(), offset);
}
TEST(utf16be, invalid) {
- std::string_view str("\xD8");
size_t offset = 0;
- auto ret = utf::read16be(str, offset);
+ auto ret = utf::read16be(std::array<uint8_t, 1>({0xD8}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "";
offset = 0;
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(std::array<uint8_t, 0>(), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "\xD8\x01";
offset = 0;
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(std::array<uint8_t, 2>({0xD8, 0x01}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "\xD8\x01\xDC";
offset = 0;
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(std::array<uint8_t, 3>({0xD8, 0x01, 0xDC}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "\xDC\x37\xD8\x01";
offset = 0;
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(std::array<uint8_t, 4>({0xDC, 0x37, 0xD8, 0x01}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "\xD8\x01\xD8\x01";
offset = 0;
- ret = utf::read16be(str, offset);
+ ret = utf::read16be(std::array<uint8_t, 4>({0xD8, 0x01, 0xD8, 0x01}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
}
TEST(utf16le, invalid) {
- std::string_view str("\x01");
size_t offset = 0;
- auto ret = utf::read16le(str, offset);
+ auto ret = utf::read16le(std::array<uint8_t, 1>({0x01}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "";
offset = 0;
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(std::array<uint8_t, 0>(), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "\x01\xD8";
offset = 0;
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(std::array<uint8_t, 2>({0x01, 0xD8}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "\x01\xD8\x37";
offset = 0;
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(std::array<uint8_t, 3>({0x01, 0xD8, 0x37}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "\x37\xDC\x01\xD8";
offset = 0;
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(std::array<uint8_t, 4>({0x37, 0xDC, 0x01, 0xD8}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "\x01\xD8\x01\xD8";
offset = 0;
- ret = utf::read16le(str, offset);
+ ret = utf::read16le(std::array<uint8_t, 4>({0x01, 0xD8, 0x01, 0xD8}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
}
diff --git a/utf/tst/test_utf32.cc b/utf/tst/test_utf32.cc
index 796b4cd..447b541 100644
--- a/utf/tst/test_utf32.cc
+++ b/utf/tst/test_utf32.cc
@@ -2,144 +2,137 @@
#include "utf_error.hh"
+#include <array>
#include <gtest/gtest.h>
TEST(utf32be, sanity) {
- std::string_view str("\x00\x00\x00\x24", 4);
size_t offset = 0;
- auto ret = utf::read32be(str, offset);
+ auto ret = utf::read32be(
+ std::array<uint8_t, 4>({0x00, 0x00, 0x00, 0x24}), offset);
EXPECT_EQ('$', ret);
EXPECT_EQ(4, offset);
- str = std::string_view("\x00\x00\x20\xAC", 4);
offset = 0;
- ret = utf::read32be(str, offset);
+ ret = utf::read32be(
+ std::array<uint8_t, 4>({0x00, 0x00, 0x20, 0xAC}), offset);
EXPECT_EQ(0x20AC, ret);
EXPECT_EQ(4, offset);
- str = std::string_view("\x00\x01\x04\x37", 4);
offset = 0;
- ret = utf::read32be(str, offset);
+ ret = utf::read32be(
+ std::array<uint8_t, 4>({0x00, 0x01, 0x04, 0x37}), offset);
EXPECT_EQ(0x10437, ret);
EXPECT_EQ(4, offset);
}
TEST(utf32le, sanity) {
- std::string_view str("\x24\x00\x00\x00", 4);
size_t offset = 0;
- auto ret = utf::read32le(str, offset);
+ auto ret = utf::read32le(
+ std::array<uint8_t, 4>({0x24, 0x00, 0x00, 0x00}), offset);
EXPECT_EQ('$', ret);
EXPECT_EQ(4, offset);
- str = std::string_view("\xAC\x20\x00\x00", 4);
offset = 0;
- ret = utf::read32le(str, offset);
+ ret = utf::read32le(
+ std::array<uint8_t, 4>({0xAC, 0x20, 0x00, 0x00}), offset);
EXPECT_EQ(0x20AC, ret);
EXPECT_EQ(4, offset);
- str = std::string_view("\x37\x04\x01\x00", 4);
offset = 0;
- ret = utf::read32le(str, offset);
+ ret = utf::read32le(
+ std::array<uint8_t, 4>({0x37, 0x04, 0x01, 0x00}), offset);
EXPECT_EQ(0x10437, ret);
EXPECT_EQ(4, offset);
}
TEST(utf32be, invalid) {
- std::string_view str("\xFF\xFF\xFF\xFF");
size_t offset = 0;
- auto ret = utf::read32be(str, offset);
+ auto ret = utf::read32be(
+ std::array<uint8_t, 4>({0xFF, 0xFF, 0xFF, 0xFF}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = std::string_view("\x00\x00\xD8\x00", 4);
offset = 0;
- ret = utf::read32be(str, offset);
+ ret = utf::read32be(
+ std::array<uint8_t, 4>({0x00, 0x00, 0xD8, 0x00}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "";
offset = 0;
- ret = utf::read32be(str, offset);
+ ret = utf::read32be(std::array<uint8_t, 0>(), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = std::string_view("\x00", 1);
offset = 0;
- ret = utf::read32be(str, offset);
+ ret = utf::read32be(std::array<uint8_t, 1>({0x00}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = std::string_view("\x00\x00", 2);
offset = 0;
- ret = utf::read32be(str, offset);
+ ret = utf::read32be(std::array<uint8_t, 2>({0x00, 0x00}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = std::string_view("\x00\x00\x00", 3);
offset = 0;
- ret = utf::read32be(str, offset);
+ ret = utf::read32be(std::array<uint8_t, 3>({0x00, 0x00, 0x00}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
}
TEST(utf32le, invalid) {
- std::string_view str("\xFF\xFF\xFF\xFF");
size_t offset = 0;
- auto ret = utf::read32le(str, offset);
+ auto ret = utf::read32le(
+ std::array<uint8_t, 4>({0xFF, 0xFF, 0xFF, 0xFF}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = std::string_view("\x00\xD8\x00\x00", 4);
offset = 0;
- ret = utf::read32le(str, offset);
+ ret = utf::read32le(
+ std::array<uint8_t, 4>({0x00, 0xD8, 0x00, 0x00}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "";
offset = 0;
- ret = utf::read32le(str, offset);
+ ret = utf::read32le(std::array<uint8_t, 0>(), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = std::string_view("\x00", 1);
offset = 0;
- ret = utf::read32le(str, offset);
+ ret = utf::read32le(std::array<uint8_t, 1>({0x00}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = std::string_view("\x00\x00", 2);
offset = 0;
- ret = utf::read32le(str, offset);
+ ret = utf::read32le(std::array<uint8_t, 2>({0x00, 0x00}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = std::string_view("\x00\x00\x00", 3);
offset = 0;
- ret = utf::read32le(str, offset);
+ ret = utf::read32le(std::array<uint8_t, 3>({0x00, 0x00, 0x00}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
}
TEST(utf32be, bom) {
- std::string_view str("\x00\x00\xFF\xFE\x00\x00\x20\xAC", 8);
+ std::array<uint8_t, 8> data({0x00, 0x00, 0xFF, 0xFE, 0x00, 0x00, 0x20, 0xAC});
size_t offset = 0;
- auto ret = utf::read32be(str, offset);
+ auto ret = utf::read32be(data, offset);
EXPECT_EQ(0xFFFE, ret);
- ret = utf::read32be(str, offset);
+ ret = utf::read32be(data, offset);
EXPECT_EQ(0x20AC, ret);
- ret = utf::read32be(str, offset);
+ ret = utf::read32be(data, offset);
EXPECT_EQ(utf::NEED_MORE, ret);
- EXPECT_EQ(str.size(), offset);
+ EXPECT_EQ(data.size(), offset);
}
TEST(utf32le, bom) {
- std::string_view str("\xFE\xFF\x00\x00\xAC\x20\x00\x00", 8);
+ std::array<uint8_t, 8> data({0xFE, 0xFF, 0x00, 0x00, 0xAC, 0x20, 0x00, 0x00});
size_t offset = 0;
- auto ret = utf::read32le(str, offset);
+ auto ret = utf::read32le(data, offset);
EXPECT_EQ(0xFFFE, ret);
- ret = utf::read32le(str, offset);
+ ret = utf::read32le(data, offset);
EXPECT_EQ(0x20AC, ret);
- ret = utf::read32le(str, offset);
+ ret = utf::read32le(data, offset);
EXPECT_EQ(utf::NEED_MORE, ret);
- EXPECT_EQ(str.size(), offset);
+ EXPECT_EQ(data.size(), offset);
}
diff --git a/utf/tst/test_utf8.cc b/utf/tst/test_utf8.cc
index 10df969..8bdeba4 100644
--- a/utf/tst/test_utf8.cc
+++ b/utf/tst/test_utf8.cc
@@ -2,187 +2,245 @@
#include "utf_error.hh"
+#include <array>
+#include <gmock/gmock.h>
#include <gtest/gtest.h>
+#include <span>
-TEST(utf8, sanity) {
- std::string_view str("$");
+TEST(utf8, read_sanity) {
size_t offset = 0;
- auto ret = utf::read8(str, offset);
+ auto ret = utf::read8(std::array<uint8_t, 1>({'$'}), offset);
EXPECT_EQ('$', ret);
EXPECT_EQ(1, offset);
- str = "\xC2\xA3";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 2>({0xC2, 0xA3}), offset);
EXPECT_EQ(0xa3, ret);
EXPECT_EQ(2, offset);
- str = "\xD0\x98";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 2>({0xD0, 0x98}), offset);
EXPECT_EQ(0x418, ret);
EXPECT_EQ(2, offset);
- str = "\xE0\xA4\xB9";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 3>({0xE0, 0xA4, 0xB9}), offset);
EXPECT_EQ(0x939, ret);
EXPECT_EQ(3, offset);
- str = "\xE2\x82\xAC";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 3>({0xE2, 0x82, 0xAC}), offset);
EXPECT_EQ(0x20AC, ret);
EXPECT_EQ(3, offset);
- str = "\xED\x95\x9C";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 3>({0xED, 0x95, 0x9C}), offset);
EXPECT_EQ(0xD55C, ret);
EXPECT_EQ(3, offset);
- str = "\xF0\x90\x8D\x88";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 4>({0xF0, 0x90, 0x8D, 0x88}), offset);
EXPECT_EQ(0x10348, ret);
EXPECT_EQ(4, offset);
}
-TEST(utf8, overlong) {
- std::string_view str("\xF0\x82\x82\xAC");
+TEST(utf8, write_sanity) {
+ std::array<uint8_t, 10> out;
size_t offset = 0;
- auto ret = utf::read8(str, offset);
+ EXPECT_TRUE(utf::write8('$', out, offset));
+ EXPECT_THAT(std::span(out).subspan(0, offset),
+ testing::ElementsAre('$'));
+ EXPECT_EQ(1, offset);
+
+ offset = 0;
+ EXPECT_TRUE(utf::write8(0xa3, out, offset));
+ EXPECT_THAT(std::span(out).subspan(0, offset),
+ testing::ElementsAre(0xC2, 0xA3));
+ EXPECT_EQ(2, offset);
+
+ offset = 0;
+ EXPECT_TRUE(utf::write8(0x418, out, offset));
+ EXPECT_THAT(std::span(out).subspan(0, offset),
+ testing::ElementsAre(0xD0, 0x98));
+ EXPECT_EQ(2, offset);
+
+ offset = 0;
+ EXPECT_TRUE(utf::write8(0x939, out, offset));
+ EXPECT_THAT(std::span(out).subspan(0, offset),
+ testing::ElementsAre(0xE0, 0xA4, 0xB9));
+ EXPECT_EQ(3, offset);
+
+ offset = 0;
+ EXPECT_TRUE(utf::write8(0x20AC, out, offset));
+ EXPECT_THAT(std::span(out).subspan(0, offset),
+ testing::ElementsAre(0xE2, 0x82, 0xAC));
+ EXPECT_EQ(3, offset);
+
+ offset = 0;
+ EXPECT_TRUE(utf::write8(0xD55C, out, offset));
+ EXPECT_THAT(std::span(out).subspan(0, offset),
+ testing::ElementsAre(0xED, 0x95, 0x9C));
+ EXPECT_EQ(3, offset);
+
+ offset = 0;
+ EXPECT_TRUE(utf::write8(0x10348, out, offset));
+ EXPECT_THAT(std::span(out).subspan(0, offset),
+ testing::ElementsAre(0xF0, 0x90, 0x8D, 0x88));
+ EXPECT_EQ(4, offset);
+}
+
+TEST(utf8, read_overlong) {
+ size_t offset = 0;
+ auto ret = utf::read8(
+ std::array<uint8_t, 4>({0xF0, 0x82, 0x82, 0xAC}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "\xE0\x81\x81";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 3>({0xE0, 0x81, 0x81}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "\xC0\x80";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 2>({0xC0, 0x80}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
}
-TEST(utf8, invalid) {
- std::string_view str("\xED\xB0\x80");
+TEST(utf8, read_invalid) {
size_t offset = 0;
- auto ret = utf::read8(str, offset);
+ auto ret = utf::read8(std::array<uint8_t, 3>({0xED, 0xB0, 0x80}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "\xFB\xFF\xFF";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 3>({0xFB, 0xFF, 0xFF}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "\xFF\xFF\xFF\xFF\xFF";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(
+ std::array<uint8_t, 5>({0xFF, 0xFF, 0xFF, 0xFF, 0xFF}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 0>(), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "\x80";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 1>({0x80}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "\xC2";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 1>({0xC2}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "\xC2\x03";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 2>({0xC2, 0x03}), offset);
EXPECT_EQ(utf::INVALID, ret);
EXPECT_EQ(0, offset);
- str = "\xE0\xA4";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 2>({0xE0, 0xA4}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
- str = "\xF0\x90\x8D";
offset = 0;
- ret = utf::read8(str, offset);
+ ret = utf::read8(std::array<uint8_t, 3>({0xF0, 0x90, 0x8D}), offset);
EXPECT_EQ(utf::NEED_MORE, ret);
EXPECT_EQ(0, offset);
}
-TEST(utf8, multiple1) {
- std::string_view str("\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69"
- "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74");
+TEST(utf8, read_multiple1) {
+ std::array<uint8_t, 25> data({
+ 0x4D, 0xC3, 0xAC, 0x6E, 0x68, 0x20, 0x6E, 0xC3, 0xB3, 0x69,
+ 0x20, 0x74, 0x69, 0xE1, 0xBA, 0xBF, 0x6E, 0x67, 0x20, 0x56,
+ 0x69, 0xE1, 0xBB, 0x87, 0x74
+ });
size_t offset = 0;
- auto ret = utf::read8(str, offset);
+ auto ret = utf::read8(data, offset);
EXPECT_EQ('M', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(0xEC, ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('n', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('h', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(' ', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('n', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(0xF3, ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('i', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(' ', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('t', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('i', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(0x1EBF, ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('n', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('g', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(' ', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('V', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('i', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(0x1EC7, ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ('t', ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(utf::NEED_MORE, ret);
- EXPECT_EQ(str.size(), offset);
+ EXPECT_EQ(data.size(), offset);
}
-TEST(utf8, multiple2) {
- std::string_view str("\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A");
+TEST(utf8, read_multiple2) {
+ std::array<uint8_t, 13> data({
+ 0xF0, 0xA8, 0x89, 0x9F, 0xE5, 0x91, 0x90, 0xE3, 0x97, 0x82,
+ 0xE8, 0xB6, 0x8A,
+ });
size_t offset = 0;
- auto ret = utf::read8(str, offset);
+ auto ret = utf::read8(data, offset);
EXPECT_EQ(0x2825F, ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(0x5450, ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(0x35C2, ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(0x8D8A, ret);
- ret = utf::read8(str, offset);
+ ret = utf::read8(data, offset);
EXPECT_EQ(utf::NEED_MORE, ret);
- EXPECT_EQ(str.size(), offset);
+ EXPECT_EQ(data.size(), offset);
+}
+
+TEST(utf8, write_no_space) {
+ std::array<uint8_t, 10> data;
+ std::span<uint8_t> out(data);
+ size_t offset = 0;
+ EXPECT_FALSE(utf::write8('$', out.subspan(0, 0), offset));
+ EXPECT_EQ(0u, offset);
+
+ EXPECT_FALSE(utf::write8(0xa3, out.subspan(0, 1), offset));
+ EXPECT_EQ(0u, offset);
+ EXPECT_FALSE(utf::write8(0x418, out.subspan(0, 0), offset));
+ EXPECT_EQ(0u, offset);
+
+ EXPECT_FALSE(utf::write8(0x939, out.subspan(0, 2), offset));
+ EXPECT_EQ(0u, offset);
+ EXPECT_FALSE(utf::write8(0x20AC, out.subspan(0, 0), offset));
+ EXPECT_EQ(0u, offset);
+
+ EXPECT_FALSE(utf::write8(0x10348, out.subspan(0, 3), offset));
+ EXPECT_EQ(0u, offset);
}