summaryrefslogtreecommitdiff
path: root/sax/src
diff options
context:
space:
mode:
Diffstat (limited to 'sax/src')
-rw-r--r--sax/src/buffer.cc398
-rw-r--r--sax/src/buffer.hh108
-rw-r--r--sax/src/decoder.cc308
-rw-r--r--sax/src/guessing_decoder.cc92
-rw-r--r--sax/src/guessing_decoder.hh21
-rw-r--r--sax/src/sax_attributes.cc38
-rw-r--r--sax/src/sax_delegate.cc21
-rw-r--r--sax/src/sax_processor.cc1098
-rw-r--r--sax/src/utils.cc37
-rw-r--r--sax/src/utils.hh4
10 files changed, 1922 insertions, 203 deletions
diff --git a/sax/src/buffer.cc b/sax/src/buffer.cc
new file mode 100644
index 0000000..964865d
--- /dev/null
+++ b/sax/src/buffer.cc
@@ -0,0 +1,398 @@
+#include "buffer.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <limits>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Buffer backed by one contiguous heap allocation that starts at
+// default_size bytes and grows (by doubling) up to max_size bytes.
+// Live data occupies [offset_, offset_ + fill_) of the allocation.
+class DynamicBuffer : public Buffer {
+ public:
+  DynamicBuffer(std::size_t default_size, std::size_t max_size)
+      : default_size_(std::min(default_size, max_size)), max_size_(max_size),
+        data_(std::make_unique_for_overwrite<uint8_t[]>(default_size_)),
+        size_(default_size_) {}
+
+  // Returns writable space after the live data. When fewer than need bytes
+  // are free the data is first moved to the front of the allocation and, if
+  // that is not enough, the allocation is grown. Returns an empty span when
+  // need can never fit below max_size or when the allocation fails.
+  std::span<uint8_t> wspan(std::size_t need) override {
+    auto avail = size_ - (offset_ + fill_);
+    if (need > avail) {
+      if (max_size_ - fill_ < need)  // Early exit if need is never possible
+        return {};
+      if (offset_ > 0) {
+        // Reclaim the already consumed prefix.
+        std::copy_n(data_.get() + offset_, fill_, data_.get());
+        offset_ = 0;
+      }
+      avail = size_ - fill_;
+      if (need > avail) {
+        auto const max = std::numeric_limits<std::size_t>::max() / 2;
+        std::size_t new_size = size_;
+        while (true) {
+          if (new_size == 0) {
+            // Doubling zero stays zero; without this step a buffer created
+            // with default_size == 0 would loop here forever.
+            new_size = 1;
+          } else if (new_size <= max) {
+            new_size *= 2;
+          } else {
+            new_size = std::numeric_limits<std::size_t>::max();
+          }
+          if (new_size >= max_size_) {
+            // The early exit above guarantees max_size_ - fill_ >= need.
+            new_size = max_size_;
+            break;
+          }
+          if (new_size - fill_ >= need)
+            break;
+        }
+        // Using new as it has std::nothrow which make_unique lacks.
+        // Easy enough to keep track of the pointers here anyway.
+        auto* tmp = new(std::nothrow) uint8_t[new_size];
+        if (tmp == nullptr)
+          return {};
+        std::copy_n(data_.get(), fill_, tmp);
+        size_ = new_size;
+        data_.reset(tmp);
+      }
+    }
+    return {data_.get() + offset_ + fill_, size_ - (offset_ + fill_)};
+  }
+
+  // Marks size bytes of the last wspan() as live data.
+  void commit(std::size_t size) override {
+    assert(size_ - (offset_ + fill_) >= size);
+    fill_ += size;
+  }
+
+  // All live data is contiguous, so the want hint is not needed.
+  std::span<uint8_t const> rspan(std::size_t) override {
+    return {data_.get() + offset_, fill_};
+  }
+
+  // Drops size bytes from the front; an emptied buffer is reset().
+  void consume(std::size_t size) override {
+    if (size == 0)
+      return;
+    assert(fill_ >= size);
+    fill_ -= size;
+    if (fill_ == 0) {
+      reset();
+    } else {
+      offset_ += size;
+    }
+  }
+
+  // Same range as rspan() but writable.
+  std::span<uint8_t> mspan(std::size_t) override {
+    return {data_.get() + offset_, fill_};
+  }
+
+  // Drops up to size bytes from the back and returns how many were dropped;
+  // an emptied buffer is reset().
+  std::size_t uncommit(std::size_t size) override {
+    auto ret = std::min(size, fill_);
+    fill_ -= ret;
+    if (fill_ == 0) {
+      reset();
+    }
+    return ret;
+  }
+
+  bool empty() const override {
+    return fill_ == 0;
+  }
+
+  bool full() const override {
+    return fill_ >= max_size_;
+  }
+
+  // Discards all data; a grown allocation is replaced by a fresh one of the
+  // default size.
+  void reset() override {
+    if (size_ != default_size_)
+      data_ = std::make_unique_for_overwrite<uint8_t[]>(size_ = default_size_);
+    offset_ = 0;
+    fill_ = 0;
+  }
+
+ private:
+  std::size_t const default_size_;
+  std::size_t const max_size_;
+  std::unique_ptr<uint8_t[]> data_;
+  std::size_t size_;       // Current allocation size.
+  std::size_t offset_{0};  // Start of live data.
+  std::size_t fill_{0};    // Number of live bytes.
+};
+
+// Fixed-size ring buffer. rptr_ is the index of the first live byte, wptr_
+// the index one past the last live byte; both wrap to 0 at size_. When the
+// two are equal full_ tells a completely full buffer from an empty one.
+class FixedBuffer : public Buffer {
+ public:
+  explicit FixedBuffer(std::size_t size)
+      : size_(size), data_(std::make_unique<uint8_t[]>(size_)) {}
+
+  // Returns the contiguous free space at wptr_. When that is smaller than
+  // need and the live data is not wrapped, the data is moved to the front
+  // to merge the free space. Returns an empty span if need cannot be met.
+  std::span<uint8_t> wspan(std::size_t need) override {
+    auto avail = wavail();
+    if (need > avail) {
+      if (need > size_ - ravail())  // Early exit if need will never fit
+        return {};
+      if (rptr_ < wptr_ || (rptr_ == wptr_ && !full_)) {
+        rotate();
+        avail = wavail();
+      } else {
+        return {};
+      }
+    }
+    return {data_.get() + wptr_, avail};
+  }
+
+  // Marks size bytes of the last wspan() as live data.
+  void commit(std::size_t size) override {
+    if (size == 0)
+      return;
+    assert(wavail() >= size);
+    wptr_ += size;
+    if (wptr_ == size_)
+      wptr_ = 0;
+    if (rptr_ == wptr_)
+      full_ = true;
+  }
+
+  std::span<uint8_t const> rspan(std::size_t want) override {
+    return mspan(want);
+  }
+
+  // Drops size bytes from the front; an emptied buffer is reset().
+  void consume(std::size_t size) override {
+    if (size == 0)
+      return;
+    assert(ravail() >= size);
+    full_ = false;
+    rptr_ += size;
+    if (rptr_ == size_)
+      rptr_ = 0;
+    if (rptr_ == wptr_)
+      reset();
+  }
+
+  // Returns the contiguous live data at rptr_. When that is smaller than
+  // want and the data wraps around the end of the storage, the storage is
+  // rotated first so that all live data becomes contiguous.
+  std::span<uint8_t> mspan(std::size_t want) override {
+    auto avail = ravail();
+    if (want > avail) {
+      if (rptr_ > wptr_ || (rptr_ == wptr_ && full_)) {
+        rotate();
+        avail = ravail();
+      }
+    }
+    return {data_.get() + rptr_, avail};
+  }
+
+  // Drops up to size bytes from the back and returns how many were dropped.
+  // A second pass is needed when the live data wraps.
+  std::size_t uncommit(std::size_t size) override {
+    if (size == 0)
+      return 0;
+    auto ret = do_uncommit(size);
+    if (ret < size) {
+      ret += do_uncommit(size - ret);
+    }
+    return ret;
+  }
+
+  bool empty() const override {
+    return rptr_ == wptr_ && !full_;
+  }
+
+  // A zero-sized buffer can never accept data, so it is reported as full
+  // (as well as empty), matching the documented make_buffer() contract.
+  bool full() const override {
+    return size_ == 0 || (rptr_ == wptr_ && full_);
+  }
+
+  void reset() override {
+    rptr_ = 0;
+    wptr_ = 0;
+    full_ = false;
+  }
+
+ private:
+  // Contiguous readable bytes starting at rptr_.
+  std::size_t ravail() const {
+    if (rptr_ < wptr_)
+      return wptr_ - rptr_;
+    if (rptr_ == wptr_ && !full_)
+      return 0;
+    return size_ - rptr_;
+  }
+
+  // Contiguous writable bytes starting at wptr_.
+  std::size_t wavail() const {
+    if (rptr_ > wptr_)
+      return rptr_ - wptr_;
+    if (rptr_ == wptr_ && full_)
+      return 0;
+    return size_ - wptr_;
+  }
+
+  // Removes up to size bytes from the contiguous run that ends at wptr_.
+  std::size_t do_uncommit(std::size_t size) {
+    if (size == 0 || (rptr_ == wptr_ && !full_))
+      return 0;
+
+    full_ = false;
+
+    // wptr_ == 0 really means "at the end of the storage" here.
+    if (wptr_ == 0)
+      wptr_ = size_;
+
+    auto avail = rptr_ < wptr_ ? wptr_ - rptr_ : wptr_;
+    avail = std::min(avail, size);
+    wptr_ -= avail;
+    return avail;
+  }
+
+  // Moves the live data so that it starts at index 0.
+  void rotate() {
+    if (rptr_ == 0)
+      return;  // Already starts at the front (includes the full case).
+
+    if (rptr_ == wptr_ && !full_) {
+      // Empty but not reset; uncommit() can leave the buffer like this.
+      rptr_ = 0;
+      wptr_ = 0;
+      return;
+    }
+
+    if (rptr_ < wptr_) {
+      // Linear data: slide it down. Destination is before the source, so a
+      // forward copy is safe even when the ranges overlap.
+      std::copy(data_.get() + rptr_, data_.get() + wptr_, data_.get());
+      wptr_ -= rptr_;
+    } else {
+      // Wrapped (or full) data: rotate the whole storage in place so that
+      // the byte at rptr_ ends up at index 0. No temporary allocation.
+      std::rotate(data_.get(), data_.get() + rptr_, data_.get() + size_);
+      wptr_ += size_ - rptr_;
+      if (wptr_ == size_)
+        wptr_ = 0;
+    }
+    rptr_ = 0;
+  }
+
+  std::size_t const size_;
+  std::unique_ptr<uint8_t[]> data_;
+  std::size_t rptr_{0};
+  std::size_t wptr_{0};
+  bool full_{false};
+};
+
+// Read view over another buffer: writes are forwarded to the wrapped
+// buffer, reads only advance a local offset and never consume from it.
+class ReadViewBufferImpl : public ReadViewBuffer {
+ public:
+  explicit ReadViewBufferImpl(std::unique_ptr<Buffer> buffer)
+      : buffer_(std::move(buffer)) {}
+
+  std::size_t consumed() const override { return offset_; }
+
+  std::unique_ptr<Buffer> release() override { return std::move(buffer_); }
+
+  std::span<uint8_t> wspan(std::size_t need) override {
+    return buffer_->wspan(need);
+  }
+
+  void commit(std::size_t size) override { buffer_->commit(size); }
+
+  std::span<uint8_t const> rspan(std::size_t want) override {
+    return unread(buffer_->rspan(offset_ + want));
+  }
+
+  // Only the view moves forward; the wrapped buffer keeps its data.
+  void consume(std::size_t size) override { offset_ += size; }
+
+  std::span<uint8_t> mspan(std::size_t want) override {
+    return unread(buffer_->mspan(offset_ + want));
+  }
+
+  std::size_t uncommit(std::size_t size) override {
+    return buffer_->uncommit(size);
+  }
+
+  bool empty() const override {
+    if (buffer_->empty())
+      return true;
+    // Ask for one byte beyond what this view has already consumed.
+    auto const all = buffer_->rspan(offset_ + 1);
+    return all.size() <= offset_;
+  }
+
+  bool full() const override { return buffer_->full(); }
+
+  // Rewinds the view; the wrapped buffer is left as it is.
+  void reset() override { offset_ = 0; }
+
+ private:
+  // Strips the bytes this view has already consumed from the front of all.
+  template <typename T>
+  std::span<T> unread(std::span<T> all) const {
+    if (all.size() <= offset_)
+      return all.first(0);
+    return all.subspan(offset_);
+  }
+
+  std::unique_ptr<Buffer> buffer_;
+  std::size_t offset_{0};
+};
+
+} // namespace
+
+// Picks the implementation: a growable buffer when there is room to grow
+// from default_size to max_size, otherwise a fixed one of max_size bytes.
+std::unique_ptr<Buffer> make_buffer(std::size_t default_size,
+                                    std::size_t max_size) {
+  bool const can_grow = default_size < max_size;
+  if (can_grow)
+    return std::make_unique<DynamicBuffer>(default_size, max_size);
+
+  return std::make_unique<FixedBuffer>(max_size);
+}
+
+// Wraps buffer in a ReadViewBufferImpl, which takes ownership of it.
+std::unique_ptr<ReadViewBuffer> make_read_view_buffer(
+ std::unique_ptr<Buffer> buffer) {
+ return std::make_unique<ReadViewBufferImpl>(std::move(buffer));
+}
+
+// Copies as much of data as the buffer accepts, one wspan() at a time.
+// Returns the number of bytes stored.
+std::size_t Buffer::write(std::span<uint8_t const> data) {
+  std::size_t written = 0;
+  for (;;) {
+    auto const pending = data.subspan(written);
+    if (pending.empty())
+      break;
+    auto const room = wspan();
+    if (room.empty())
+      break;  // Buffer will not take any more.
+    auto const chunk = std::min(pending.size(), room.size());
+    std::copy_n(pending.data(), chunk, room.data());
+    commit(chunk);
+    written += chunk;
+  }
+  return written;
+}
+
+// Stores all of data or nothing. Returns false when the buffer cannot
+// provide data.size() writable bytes in one span.
+bool Buffer::write_all(std::span<uint8_t const> data) {
+  auto const count = data.size();
+  if (count == 0)
+    return true;
+  auto const room = wspan(count);
+  if (room.empty())
+    return false;
+  std::copy_n(data.data(), count, room.data());
+  commit(count);
+  return true;
+}
+
+// Moves as many bytes as possible from the buffer into data, one rspan()
+// at a time. Returns the number of bytes copied.
+std::size_t Buffer::read(std::span<uint8_t> data) {
+  std::size_t copied = 0;
+  for (;;) {
+    auto const wanted = data.subspan(copied);
+    if (wanted.empty())
+      break;
+    auto const ready = rspan();
+    if (ready.empty())
+      break;  // Nothing left to read.
+    auto const chunk = std::min(wanted.size(), ready.size());
+    std::copy_n(ready.data(), chunk, wanted.data());
+    consume(chunk);
+    copied += chunk;
+  }
+  return copied;
+}
+
+// Fills data completely from the buffer, or leaves the buffer untouched
+// and returns false when it holds fewer than data.size() readable bytes.
+bool Buffer::read_all(std::span<uint8_t> data) {
+  auto const count = data.size();
+  auto const ready = rspan(count);
+  if (ready.size() < count)
+    return false;
+  std::copy_n(ready.data(), count, data.data());
+  consume(count);
+  return true;
+}
+
+} // namespace sax
+} // namespace modxml
+
diff --git a/sax/src/buffer.hh b/sax/src/buffer.hh
new file mode 100644
index 0000000..d9fb9fc
--- /dev/null
+++ b/sax/src/buffer.hh
@@ -0,0 +1,108 @@
+#ifndef BUFFER_HH
+#define BUFFER_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <span>
+
+namespace modxml {
+namespace sax {
+
+// Abstract byte buffer with separate write (wspan/commit), read
+// (rspan/consume) and modify (mspan/uncommit) access. Not copyable.
+class HIDDEN Buffer {
+ public:
+ virtual ~Buffer() = default;
+
+ Buffer(Buffer const&) = delete;
+ Buffer& operator=(Buffer const&) = delete;
+
+ // Returns a writable span, either at least need large or in case
+ // the buffer is full, an empty span.
+ // Returned span is valid until any other method is called on the buffer.
+ virtual std::span<uint8_t> wspan(std::size_t need = 1) = 0;
+ // Commit size data from the last returned wspan. size must be <= span.size.
+ // Remember that the span is now invalid and you need to call wspan again
+ // to write more.
+ virtual void commit(std::size_t size) = 0;
+
+ // Returns a readable span of all readily available data in buffer.
+ // If there is enough data in the buffer to satisfy want, the returned
+ // span is at least as large.
+ // Returned span is valid until any other method is called on the buffer.
+ virtual std::span<uint8_t const> rspan(std::size_t want = 1) = 0;
+ // Consume size data from buffer. size must be <= span.size.
+ // Remember that the span is now invalid and you need to call rspan again
+ // to read more.
+ virtual void consume(std::size_t size) = 0;
+
+ // Returns the same span as rspan but this is writable, you can modify
+ // the content. You cannot change the size of the span.
+ // If you wish to append data, use wspan() + commit().
+ // If you wish to remove data, use uncommit().
+ // If you wish to insert you have to be clever.
+ // Returned span is valid until any other method is called on the buffer.
+ virtual std::span<uint8_t> mspan(std::size_t want = 1) = 0;
+
+ // Uncommit the last size bytes in the buffer. Returns the number of bytes
+ // removed. If you used wspan() + commit() to add ten (10) bytes say and then
+ // call uncommit() with a size of seven (7) the first three (3) bytes written
+ // will be left in the buffer.
+ virtual std::size_t uncommit(std::size_t size) = 0;
+
+ // Returns true if buffer is empty.
+ virtual bool empty() const = 0;
+
+ // Returns true if buffer is full. This means filled to max_size.
+ virtual bool full() const = 0;
+
+ // Clear buffer, reset back to initial state.
+ virtual void reset() = 0;
+
+ // Write as much as possible of data to buffer.
+ // Returns bytes written (may be zero).
+ std::size_t write(std::span<uint8_t const> data);
+
+ // Either write all of the data to buffer or none. Returns true if data was
+ // written or data was empty.
+ bool write_all(std::span<uint8_t const> data);
+
+ // Read as much as possible from buffer to data.
+ // Returns bytes read (may be zero).
+ std::size_t read(std::span<uint8_t> data);
+
+ // Either fill data with data from buffer or return false.
+ bool read_all(std::span<uint8_t> data);
+
+ protected:
+ Buffer() = default;
+};
+
+// Create a buffer. default_size is used as a hint but generally that
+// will be the initial size of the buffer. max_size is a hard limit.
+// max_size == 0 is valid but will return an always full and empty buffer.
+std::unique_ptr<Buffer> HIDDEN make_buffer(std::size_t default_size,
+ std::size_t max_size);
+
+// Buffer that wraps another buffer and tracks its own read position.
+// Instances are created with make_read_view_buffer().
+class ReadViewBuffer : public Buffer {
+ public:
+ // Returns bytes consumed in this buffer.
+ virtual std::size_t consumed() const = 0;
+
+ // Take ownership back of the wrapped buffer from the read view.
+ // The read view is now unusable.
+ virtual std::unique_ptr<Buffer> release() = 0;
+
+ protected:
+ ReadViewBuffer() = default;
+};
+
+// Create a read view buffer. Writing will go to the wrapped buffer. Reading
+// is done on the read view buffer without moving the wrapped buffer's read
+// pointer. These views are lightweight.
+std::unique_ptr<ReadViewBuffer> HIDDEN make_read_view_buffer(
+ std::unique_ptr<Buffer> buffer);
+
+} // namespace sax
+} // namespace modxml
+
+#endif // BUFFER_HH
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc
index 30b1735..35b9b46 100644
--- a/sax/src/decoder.cc
+++ b/sax/src/decoder.cc
@@ -12,273 +12,233 @@ namespace sax {
namespace {
-class UtfDecoder : public Decoder {
+// Base for decoders whose byte order is fixed by the subclass. Code points
+// are obtained from the subclass' read() and written to out with
+// utf::write8(). A U+FEFF read as the very first code point is dropped.
+class KnownEndianDecoder : public Decoder {
 public:
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
+ State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) override {
+ // Work on a copy; in_offset only advances once a code point was written.
+ std::size_t tmp = in_offset;
+ uint32_t ret = read(in, tmp);
+ if (ret == utf::NEED_MORE)
+ return State::NEED_MORE;
+ if (ret == utf::INVALID)
+ return State::INVALID;
+
if (bom_ == -1) UNLIKELY {
- std::size_t tmp = in_offset;
- uint32_t ret = read(in, tmp);
- if (ret == utf::NEED_MORE) {
- return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return State::INVALID;
- }
if (ret == 0xfeff) {
// To allow offset to advance and to return, we need to
// read at least one more character completely.
ret = read(in, tmp);
- if (ret == utf::NEED_MORE) {
+ if (ret == utf::NEED_MORE)
return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
+ if (ret == utf::INVALID)
return State::INVALID;
- }
bom_ = 1;
} else {
bom_ = 0;
}
- in_offset = tmp;
- out[out_offset++] = ret;
- if (out_offset == out_size)
- return State::GOOD;
+ if (!utf::write8(ret, out, out_offset)) {
+ // Nothing was consumed, so the BOM check has to run again next call.
+ bom_ = -1;
+ return State::NEED_MORE;
+ }
+ } else {
+ if (!utf::write8(ret, out, out_offset))
+ return State::NEED_MORE;
}
+ in_offset = tmp;
- do {
- uint32_t ret = read(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- return State::GOOD;
+ // At least one code point was produced. Any further problem just ends
+ // this call with GOOD; in_offset is left at the offending position.
+ while (true) {
+ ret = read(in, tmp);
+ if (ret == utf::NEED_MORE || ret == utf::INVALID)
+ return State::GOOD;
+ if (!utf::write8(ret, out, out_offset))
+ return State::GOOD;
+ in_offset = tmp;
+ }
}
protected:
- UtfDecoder() = default;
+ KnownEndianDecoder() = default;
- virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;
+ // Reads one code point from data at offset and advances offset; may
+ // return utf::NEED_MORE or utf::INVALID instead.
+ virtual uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const = 0;
private:
int8_t bom_{-1};
};
-class Utf8Decoder : public UtfDecoder {
+// KnownEndianDecoder that reads code points with utf::read8().
+class Utf8Decoder : public KnownEndianDecoder {
public:
Utf8Decoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read8(data, offset);
}
};
-class Utf16BeDecoder : public UtfDecoder {
+// KnownEndianDecoder that reads code points with utf::read16be().
+class Utf16BeDecoder : public KnownEndianDecoder {
public:
Utf16BeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read16be(data, offset);
}
};
-class Utf16LeDecoder : public UtfDecoder {
+// KnownEndianDecoder that reads code points with utf::read16le().
+class Utf16LeDecoder : public KnownEndianDecoder {
public:
Utf16LeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read16le(data, offset);
}
};
-class Utf32BeDecoder : public UtfDecoder {
+// KnownEndianDecoder that reads code points with utf::read32be().
+class Utf32BeDecoder : public KnownEndianDecoder {
public:
Utf32BeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read32be(data, offset);
}
};
-class Utf32LeDecoder : public UtfDecoder {
+// KnownEndianDecoder that reads code points with utf::read32le().
+class Utf32LeDecoder : public KnownEndianDecoder {
public:
Utf32LeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read32le(data, offset);
}
};
-class Utf16Decoder : public Decoder {
+// Base for decoders that learn the byte order from a mandatory BOM. The
+// first code point is read big endian: 0xfeff selects big endian, 0xfffe
+// little endian, anything else is INVALID. Output is written with
+// utf::write8().
+class UnknownEndianDecoder : public Decoder {
public:
- Utf16Decoder() = default;
-
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
+ State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) override {
+ std::size_t tmp = in_offset;
if (endian_ == -1) UNLIKELY {
- std::size_t tmp = in_offset;
- uint32_t ret = utf::read16be(in, tmp);
- int8_t endian;
- if (ret == utf::NEED_MORE) {
+ uint32_t ret = readbe(in, tmp);
+ if (ret == utf::NEED_MORE)
return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
+ if (ret == utf::INVALID)
return State::INVALID;
- }
if (ret == 0xfeff) {
- endian = 1; // Big endian
+ endian_ = 1;
} else if (ret == 0xfffe) {
- endian = 0; // Little endian
+ endian_ = 0;
} else {
return State::INVALID;
}
+ // The BOM is consumed right away, even if nothing follows it yet.
+ in_offset = tmp;
+ }
- // To allow offset to advance and to return, we need to
- // read at least one more character completely.
- ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
- if (ret == utf::NEED_MORE) {
+ if (endian_ == 0) {
+ // Little endian: the first code point decides the returned state.
+ uint32_t ret = readle(in, tmp);
+ if (ret == utf::NEED_MORE)
return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
+ if (ret == utf::INVALID)
return State::INVALID;
- }
+ if (!utf::write8(ret, out, out_offset))
+ return State::NEED_MORE;
+ in_offset = tmp;
- endian_ = endian;
+ // Later problems end the call with GOOD, in_offset stays before them.
+ while (true) {
+ ret = readle(in, tmp);
+ if (ret == utf::NEED_MORE || ret == utf::INVALID)
+ return State::GOOD;
+ if (!utf::write8(ret, out, out_offset))
+ return State::GOOD;
+ in_offset = tmp;
+ }
+ } else /* if (endian_ == 1) */ {
+ // Big endian: same structure as the little endian branch above.
+ uint32_t ret = readbe(in, tmp);
+ if (ret == utf::NEED_MORE)
+ return State::NEED_MORE;
+ if (ret == utf::INVALID)
+ return State::INVALID;
+ if (!utf::write8(ret, out, out_offset))
+ return State::NEED_MORE;
in_offset = tmp;
- out[out_offset++] = ret;
- if (out_offset == out_size)
- return State::GOOD;
- }
- if (endian_ == 1) {
- do {
- uint32_t ret = utf::read16be(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- } else {
- do {
- uint32_t ret = utf::read16le(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
+ while (true) {
+ ret = readbe(in, tmp);
+ if (ret == utf::NEED_MORE || ret == utf::INVALID)
+ return State::GOOD;
+ if (!utf::write8(ret, out, out_offset))
+ return State::GOOD;
+ in_offset = tmp;
+ }
}
- return State::GOOD;
}
+ protected:
+ UnknownEndianDecoder() = default;
+
+ // Read one code point in little/big endian order and advance offset; may
+ // return utf::NEED_MORE or utf::INVALID instead.
+ virtual uint32_t readle(
+ std::span<uint8_t const> data, std::size_t& offset) const = 0;
+ virtual uint32_t readbe(
+ std::span<uint8_t const> data, std::size_t& offset) const = 0;
+
private:
int8_t endian_{-1};
};
-class Utf32Decoder : public Decoder {
+// UnknownEndianDecoder using utf::read16le() / utf::read16be().
+class Utf16Decoder : public UnknownEndianDecoder {
public:
- Utf32Decoder() = default;
+ Utf16Decoder() = default;
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
- if (endian_ == -1) UNLIKELY {
- std::size_t tmp = in_offset;
- uint32_t ret = utf::read32be(in, tmp);
- int8_t endian;
- if (ret == utf::NEED_MORE) {
- return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- tmp = in_offset;
- ret = utf::read32le(in, tmp);
- if (ret == 0xfeff) {
- endian = 0; // Little endian
- } else {
- return State::INVALID;
- }
- } else if (ret == 0xfeff) {
- endian = 1; // Big endian
- } else {
- return State::INVALID;
- }
+ uint32_t readle(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read16le(data, offset);
+ }
- // To allow offset to advance and to return, we need to
- // read the next character completely.
- ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
- if (ret == utf::NEED_MORE) {
- return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return State::INVALID;
- }
+ uint32_t readbe(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read16be(data, offset);
+ }
+};
- endian_ = endian;
- in_offset = tmp;
- out[out_offset++] = ret;
- if (out_offset == out_size)
- return State::GOOD;
- }
+// UnknownEndianDecoder using utf::read32le() / utf::read32be().
+class Utf32Decoder : public UnknownEndianDecoder {
+ public:
+ Utf32Decoder() = default;
- if (endian_ == 1) {
- do {
- uint32_t ret = utf::read32be(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- } else {
- do {
- uint32_t ret = utf::read32le(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- }
- return State::GOOD;
+ uint32_t readle(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read32le(data, offset);
}
- private:
- int8_t endian_{-1};
+ uint32_t readbe(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read32be(data, offset);
+ }
};
class AsciiDecoder : public Decoder {
public:
AsciiDecoder() = default;
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
- do {
- if (in_offset == in.size())
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- if (in[in_offset] & 0x80)
- return out_offset > out_start ? State::GOOD : State::INVALID;
- out[out_offset++] = in[in_offset++];
- } while (out_offset < out_size);
- return State::GOOD;
+ // Passes bytes below 0x80 through utf::write8(); a byte with the high bit
+ // set is INVALID. The first byte decides the returned state, after that
+ // any problem simply ends the call with GOOD.
+ State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) override {
+ if (in_offset >= in.size())
+ return State::NEED_MORE;
+ if (in[in_offset] & 0x80)
+ return State::INVALID;
+ if (!utf::write8(in[in_offset], out, out_offset))
+ return State::NEED_MORE;
+ ++in_offset;
+
+ while (true) {
+ if (in_offset >= in.size() || in[in_offset] & 0x80)
+ return State::GOOD;
+ if (!utf::write8(in[in_offset], out, out_offset))
+ return State::GOOD;
+ ++in_offset;
+ }
}
};
diff --git a/sax/src/guessing_decoder.cc b/sax/src/guessing_decoder.cc
new file mode 100644
index 0000000..e72dab3
--- /dev/null
+++ b/sax/src/guessing_decoder.cc
@@ -0,0 +1,92 @@
+#include "guessing_decoder.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+#include <cassert>
+
+using namespace std::string_view_literals;
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Returns true if the bytes of a starting at a_offset begin with b.
+// b is compared byte-wise: each char is converted to uint8_t first, since a
+// plain char may be signed and "\xef" would then never equal the byte 0xef.
+bool eq(std::span<uint8_t const> a, std::size_t a_offset, std::string_view b) {
+  if (a.size() - a_offset < b.size())
+    return false;
+  for (std::size_t i = 0; i < b.size(); ++i) {
+    if (a[a_offset + i] != static_cast<uint8_t>(b[i]))
+      return false;
+  }
+  return true;
+}
+
+// Decoder that picks a concrete decoder from the first bytes of the input
+// (BOM, or the zero-byte pattern around the first character) and forwards
+// everything to it from then on. Byte patterns follow XML 1.0 Appendix F:
+//   EF BB BF     UTF-8 BOM
+//   00 00 FE FF  UTF-32 big endian BOM
+//   FF FE 00 00  UTF-32 little endian BOM
+//   FE FF        UTF-16 big endian BOM
+//   FF FE        UTF-16 little endian BOM
+//   00 00 00 xx  UTF-32 big endian, no BOM
+//   xx 00 00 00  UTF-32 little endian, no BOM
+//   00 xx        UTF-16 big endian, no BOM
+//   xx 00        UTF-16 little endian, no BOM
+class GuessingDecoder : public Decoder {
+ public:
+  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+               std::span<uint8_t> out, std::size_t& out_offset) override {
+    assert(in_offset <= in.size());
+
+    if (!decided_) {
+      if (eq(in, in_offset, "\xef\xbb\xbf"sv)) {
+        // Not skipped here; the created decoder is left to handle the BOM.
+        decided_ = create_utf8_decoder();
+      } else if (eq(in, in_offset, "\x00\x00\xfe\xff"sv)) {
+        in_offset += 4;
+        decided_ = create_utf32be_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe\x00\x00"sv)) {
+        // Must be tested before the UTF-16 little endian BOM, which is a
+        // prefix of this one.
+        in_offset += 4;
+        decided_ = create_utf32le_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe"sv)) {
+        // Could be UTF-32 BOM, need more data to decide
+        // (note, an xml document encoded in UTF-16 that is less than 4 bytes
+        // is rather impossible).
+        if (in.size() - in_offset < 4)
+          return State::NEED_MORE;
+        in_offset += 2;
+        decided_ = create_utf16le_decoder();
+      } else if (eq(in, in_offset, "\xfe\xff"sv)) {
+        in_offset += 2;
+        decided_ = create_utf16be_decoder();
+      } else {
+        // No BOM: look at where the zero bytes are around the first
+        // character.
+        auto avail = in.size() - in_offset;
+        if (avail == 0)
+          return State::NEED_MORE;
+        if (avail >= 4 && in[in_offset] == 0 && in[in_offset + 1] == 0
+            && in[in_offset + 2] == 0 && in[in_offset + 3] != 0) {
+          decided_ = create_utf32be_decoder();
+        } else if (avail >= 4 && in[in_offset] != 0 && in[in_offset + 1] == 0
+                   && in[in_offset + 2] == 0 && in[in_offset + 3] == 0) {
+          decided_ = create_utf32le_decoder();
+        } else if (avail >= 2 && in[in_offset] == 0
+                   && in[in_offset + 1] != 0) {
+          decided_ = create_utf16be_decoder();
+        } else if (avail >= 2 && in[in_offset] != 0
+                   && in[in_offset + 1] == 0) {
+          decided_ = create_utf16le_decoder();
+        } else {
+          auto tmp = in_offset;
+          auto ret = utf::read8(in, tmp);
+          if (ret == utf::NEED_MORE)
+            return State::NEED_MORE;
+          if (ret == utf::INVALID)
+            return State::INVALID;
+          // UTF-8 should be good enough to read the XML declaration.
+          decided_ = create_utf8_decoder();
+        }
+      }
+    }
+    return decided_->decode(in, in_offset, out, out_offset);
+  }
+
+ private:
+  // Null until the encoding has been decided.
+  std::unique_ptr<Decoder> decided_;
+};
+
+} // namespace
+
+// Returns a new GuessingDecoder that has not yet decided on an encoding.
+std::unique_ptr<Decoder> create_guessing_decoder() {
+ return std::make_unique<GuessingDecoder>();
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/guessing_decoder.hh b/sax/src/guessing_decoder.hh
new file mode 100644
index 0000000..0f42c3b
--- /dev/null
+++ b/sax/src/guessing_decoder.hh
@@ -0,0 +1,21 @@
+#ifndef GUESSING_DECODER_HH
+#define GUESSING_DECODER_HH
+
+#include "macros.hh"
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+// Decoder that tries to figure out, using the BOM or just magic,
+// what encoding is used. Optimized for the first character being
+// '<'.
+std::unique_ptr<Decoder> HIDDEN create_guessing_decoder();
+
+} // namespace sax
+} // namespace modxml
+
+#endif // GUESSING_DECODER_HH
diff --git a/sax/src/sax_attributes.cc b/sax/src/sax_attributes.cc
new file mode 100644
index 0000000..230c677
--- /dev/null
+++ b/sax/src/sax_attributes.cc
@@ -0,0 +1,38 @@
+#include "sax_attributes.hh"
+
+namespace modxml {
+namespace sax {
+
+// Initializes the name and value members from the arguments of the same
+// name.
+Attribute::Attribute(std::string_view name, std::string_view value)
+ : name(name), value(value) {}
+
+// Value of the first attribute called name, searching front to back, or
+// std::nullopt when there is none.
+std::optional<std::string_view> Attributes::find_first(std::string_view name)
+    const {
+  auto it = begin();
+  while (it != end()) {
+    if (it->name == name)
+      return it->value;
+    ++it;
+  }
+  return std::nullopt;
+}
+
+// Value of the last attribute called name, searching back to front, or
+// std::nullopt when there is none.
+std::optional<std::string_view> Attributes::find_last(std::string_view name)
+    const {
+  auto remaining = size();
+  while (remaining > 0) {
+    --remaining;
+    auto const& candidate = at(remaining);
+    if (candidate.name == name)
+      return candidate.value;
+  }
+  return std::nullopt;
+}
+
+// Index of the first attribute called name at position index or later, or
+// std::nullopt when there is none.
+std::optional<std::size_t> Attributes::find(std::string_view name,
+                                            std::size_t index) const {
+  while (index < size()) {
+    if (at(index).name == name)
+      return index;
+    ++index;
+  }
+  return std::nullopt;
+}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/sax_delegate.cc b/sax/src/sax_delegate.cc
new file mode 100644
index 0000000..2c2cfcd
--- /dev/null
+++ b/sax/src/sax_delegate.cc
@@ -0,0 +1,21 @@
+#include "sax_delegate.hh"
+
+namespace modxml {
+namespace sax {
+
+// Default Delegate handlers. Every one of them has an empty body, so an
+// event that is not overridden is ignored.
+void Delegate::start_element(std::string_view, Attributes const&) {}
+
+void Delegate::empty_element(std::string_view, Attributes const&) {}
+
+void Delegate::end_element(std::string_view) {}
+
+void Delegate::character_data(std::string_view) {}
+
+void Delegate::processing_instruction(std::string_view, std::string_view) {}
+
+void Delegate::comment(std::string_view) {}
+
+void Delegate::error(std::string_view) {}
+
+} // namespace sax
+} // namespace modxml
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
index ea9f753..afc9d3b 100644
--- a/sax/src/sax_processor.cc
+++ b/sax/src/sax_processor.cc
@@ -1,18 +1,41 @@
#include "sax_processor.hh"
-#include "sax_decoder.hh"
+#include <iostream>
+
+#include "buffer.hh"
+#include "guessing_decoder.hh"
#include "processor.hh"
+#include "sax_attributes.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+#include "sax_delegate.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
#include "utils.hh"
#include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <format>
+#include <map>
#include <optional>
#include <utility>
+#include <vector>
+
+using namespace std::string_view_literals;
namespace modxml {
namespace sax {
namespace {
+// Buffer sizing constants; the code using them is outside this excerpt.
+// NOTE(review): presumably byte counts for the processor's buffers - confirm.
+constexpr std::size_t kDefaultBufferSize = 8192;
+constexpr std::size_t kMinBufferSize = 128;
+
+// True for the ASCII decimal digits '0' through '9' only.
+inline bool is_digit(char c) {
+  if (c < '0')
+    return false;
+  return c <= '9';
+}
+
// 2.2 Characters
// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
@@ -75,12 +98,185 @@ inline bool is_namechar(uint32_t c) {
(c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040);
}
-/* [5] Name ::= NameStartChar (NameChar)*
+/*
+[5] Name ::= NameStartChar (NameChar)*
[6] Names ::= Name (#x20 Name)*
[7] Nmtoken ::= (NameChar)+
[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
*/
+// Maps ASCII 'A'..'Z' to 'a'..'z'; every other character is returned
+// unchanged. The result is a character (the previous `bool` return type
+// collapsed every value to true/false, which made eq_lowercase() useless).
+inline char ascii_lowercase(char c) {
+  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
+}
+
+// Compares `a`, passed through ascii_lowercase() character by character,
+// against `b` taken verbatim. Strings of different length never compare
+// equal.
+bool eq_lowercase(std::string_view a, std::string_view b) {
+  return std::equal(a.begin(), a.end(), b.begin(), b.end(),
+                    [](char lhs, char rhs) {
+                      return !(ascii_lowercase(lhs) != rhs);
+                    });
+}
+
+// Reinterprets a span of bytes as a string_view over the same memory.
+// No copy is made; the view is only valid while the bytes are.
+inline std::string_view make_string_view(std::span<uint8_t const> span) {
+  auto const* chars = reinterpret_cast<char const*>(span.data());
+  return {chars, span.size()};
+}
+
+// Lookup table for references that may appear in attribute values: the five
+// predefined XML entities plus numeric character references (&#NNN; and
+// &#xHHH;).
+class Entities {
+ public:
+  Entities() {
+    data_.emplace("lt", "<");
+    data_.emplace("gt", ">");
+    data_.emplace("amp", "&");
+    data_.emplace("apos", "'");
+    data_.emplace("quot", "\"");
+  }
+
+  // Resolves `entity` -- the text between '&' and ';', both excluded -- to
+  // its replacement text. Returns std::nullopt when the reference is empty,
+  // unknown, malformed, or is a numeric reference that does not name a
+  // legal XML Char.
+  std::optional<std::string> get(std::string const& entity) const {
+    if (entity.empty())
+      return std::nullopt;
+    if (entity.front() == '#')
+      return char_reference(entity);
+    auto it = data_.find(entity);
+    if (it == data_.end())
+      return std::nullopt;
+    return it->second;
+  }
+
+ private:
+  // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
+  //        | [#x10000-#x10FFFF]
+  static bool is_xml_char(uint32_t c) {
+    return c == 0x9 || c == 0xa || c == 0xd ||
+        (c >= 0x20 && c <= 0xd7ff) ||
+        (c >= 0xe000 && c <= 0xfffd) ||
+        (c >= 0x10000 && c <= 0x10ffff);
+  }
+
+  // Handles "#NNN" (decimal) and "#xHHH" (hexadecimal). `entity` must start
+  // with '#'.
+  static std::optional<std::string> char_reference(
+      std::string const& entity) {
+    if (entity.size() == 1)
+      return std::nullopt;
+    int base = 10;
+    char const* start = entity.data() + 1;
+    char const* const end = entity.data() + entity.size();
+    if (entity[1] == 'x') {
+      base = 16;
+      ++start;
+    }
+    uint32_t value;
+    auto [ptr, ec] = std::from_chars(start, end, value, base);
+    // The whole remainder must be digits; "#", "#x" and "#12ab" all fail.
+    if (ec != std::errc() || ptr != end)
+      return std::nullopt;
+    // A character reference must name a legal Char. Without this check
+    // values such as 0, surrogates or anything above 0x10FFFF would be
+    // handed to utf::write8() with only a four byte scratch buffer.
+    if (!is_xml_char(value))
+      return std::nullopt;
+    uint8_t tmp[4];
+    std::size_t offset = 0;
+    utf::write8(value, tmp, offset);
+    return std::string(reinterpret_cast<char*>(tmp), offset);
+  }
+
+  std::map<std::string, std::string> data_;
+};
+
+// Expands every "&name;" reference in `str` in place, starting the search at
+// index `last`. Returns false -- leaving `str` partially expanded -- as soon
+// as a reference has no terminating ';' or is not known to `entities`.
+//
+// The search resumes after the inserted replacement text, so replacement
+// text is never re-scanned ("&amp;lt;" becomes "&lt;", not "<"). The
+// previous version never advanced `last` nor modified `str`, so any valid
+// reference made it loop forever.
+bool deamp(Entities const& entities, std::string& str, std::size_t last = 0) {
+  while (true) {
+    auto const amp = str.find('&', last);
+    if (amp == std::string::npos)
+      break;
+    auto const name_start = amp + 1;
+    auto const semicolon = str.find(';', name_start);
+    if (semicolon == std::string::npos)
+      return false;
+    auto const replacement =
+        entities.get(str.substr(name_start, semicolon - name_start));
+    if (!replacement.has_value())
+      return false;
+    // Replace the whole "&name;" span, both delimiters included.
+    str.replace(amp, semicolon + 1 - amp, *replacement);
+    last = amp + replacement->size();
+  }
+  return true;
+}
+
+// Drops the surrounding quote characters from `quoted` and runs the rest
+// through deamp(). Yields the resulting string, or std::nullopt when
+// deamp() reports a bad reference. `quoted` must be at least two characters
+// long and start and end with the same character.
+std::optional<std::string> unquote(Entities const& entities,
+                                   std::string_view quoted) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  quoted.remove_prefix(1);
+  quoted.remove_suffix(1);
+  std::string inner(quoted);
+  if (!deamp(entities, inner))
+    return std::nullopt;
+  return inner;
+}
+
+// Like unquote(), but avoids the copy when the quoted text contains no '&'.
+// In that case the returned view points into `quoted` and `tmp` is left
+// untouched. Otherwise the text is copied into `tmp`, passed to deamp() and
+// the returned view points into `tmp`. Yields std::nullopt when deamp()
+// fails.
+std::optional<std::string_view> unquote_if_needed(Entities const& entities,
+                                                  std::string_view quoted,
+                                                  std::string& tmp) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  auto const inner = quoted.substr(1, quoted.size() - 2);
+  auto const first_amp = inner.find('&');
+  if (first_amp == std::string_view::npos)
+    return inner;
+  tmp.assign(inner);
+  if (!deamp(entities, tmp, first_amp))
+    return std::nullopt;
+  return std::string_view(tmp);
+}
+
+// Attributes implementation backed by a list of (offset, length) pairs into
+// a byte span. Names and reference-free values are views into that span;
+// values that needed reference expansion are owned by this object.
+class AttributesImpl : public Attributes {
+ public:
+  AttributesImpl() = default;
+
+  // Builds the attribute list.
+  //
+  // `offsets` holds, from index `first`, groups of four numbers: name
+  // offset, name length, value offset, value length, all relative to
+  // `data`. The value range includes the surrounding quotes. Trailing
+  // entries that do not form a complete group are ignored.
+  //
+  // Returns false if a value contains an invalid reference; attributes
+  // already added stay in place.
+  bool init(Entities const& entities,
+            std::span<const uint8_t> data,
+            std::vector<std::size_t> const& offsets,
+            std::size_t first) {
+    // Guard the subtraction, `first` past the end must not wrap around.
+    if (first < offsets.size())
+      attr_.reserve((offsets.size() - first) / 4);
+    for (std::size_t a = first; a + 4 <= offsets.size(); a += 4) {
+      auto name = make_string_view(data.subspan(offsets[a], offsets[a + 1]));
+      std::string tmp;
+      auto value = unquote_if_needed(
+          entities,
+          make_string_view(data.subspan(offsets[a + 2], offsets[a + 3])),
+          tmp);
+      if (!value.has_value())
+        return false;
+      if (tmp.empty()) {
+        attr_.emplace_back(name, *value);
+      } else {
+        // The expanded value lives in `tmp`. Moving a short string moves
+        // its characters as well (small string optimisation), so a view
+        // taken before the move would dangle. Park the string on the heap
+        // first and take the view from its final location.
+        owned_.push_back(std::make_unique<std::string>(std::move(tmp)));
+        attr_.emplace_back(name, std::string_view(*owned_.back()));
+      }
+    }
+    return true;
+  }
+
+  iterator begin() const override {
+    return Iterator(this, 0);
+  }
+
+  iterator end() const override {
+    return Iterator(this, attr_.size());
+  }
+
+  std::size_t size() const override {
+    return attr_.size();
+  }
+
+  Attribute const& at(std::size_t index) const override {
+    return attr_[index];
+  }
+
+ private:
+  // Gives access to the iterator constructor taking (attributes, index).
+  class Iterator : public iterator {
+   public:
+    Iterator(Attributes const* attributes, std::size_t index)
+        : iterator(attributes, index) {}
+  };
+
+  struct AttributeImpl : public Attribute {
+    AttributeImpl(std::string_view name, std::string_view value)
+        : Attribute(name, value) {}
+  };
+
+  // Storage for expanded values. One heap allocation per string keeps the
+  // character data at a stable address when either vector grows.
+  std::vector<std::unique_ptr<std::string>> owned_;
+  std::vector<AttributeImpl> attr_;
+};
+
class ProcessorImpl : public Processor {
public:
ProcessorImpl(std::shared_ptr<Delegate> delegate,
@@ -91,15 +287,898 @@ class ProcessorImpl : public Processor {
: delegate_(std::move(delegate)),
decoder_factory_(std::move(decoder_factory)),
decoder_(std::move(decoder)),
- default_buffer_size_(default_buffer_size),
- max_buffer_size_(max_buffer_size) {}
+ forced_decoder_(decoder_),
+ buffer_(make_buffer(default_buffer_size, max_buffer_size)) {
+ if (!decoder_)
+ decoder_ = create_guessing_decoder();
+
+ expect_document();
+ }
+
+  // Queues a FILL_BUFFER command for `data` (read from `offset`) and then
+  // runs queued commands, most recently queued first, until one of them
+  // needs more input or reports an error, or until no commands are left.
+  // Returns `consumed` as updated by fill_buffer().
+  std::size_t process(std::span<uint8_t const> data,
+                      std::size_t offset) override {
+    cmds_.emplace_back(Command::FILL_BUFFER, Count::ZERO_OR_ONE);
+
+    std::size_t consumed = 0;
+
+    while (true) {
+      if (cmds_.empty()) {
+        // Nothing more is expected. Leftover bytes are reported through the
+        // delegate only; the document content is not dumped to stderr.
+        if (!buffer_->empty())
+          delegate_->error("Extra data at end");
+        return consumed;
+      }
+
+      // Work on a copy: handlers update current.offset, and the copy is put
+      // back if the command has to be retried.
+      auto current = cmds_.back();
+      auto const old_size = cmds_.size();
+      cmds_.pop_back();
+      Process ret;
+      switch (current.command) {
+        case Command::FILL_BUFFER:
+          ret = fill_buffer(data, offset, consumed);
+          break;
+        case Command::MISC:
+          ret = process_misc(current);
+          break;
+        case Command::SPACE:
+          ret = process_space(current);
+          break;
+        case Command::ELEMENT:
+          ret = process_element(current);
+          break;
+        case Command::COMMENT:
+          ret = process_comment(current);
+          break;
+        case Command::PROCESSING_INSTRUCTION:
+          ret = process_processing_instruction(current);
+          break;
+        case Command::XMLDECL:
+          ret = process_xmldecl(current);
+          break;
+        case Command::ATTRIBUTE:
+          ret = process_attribute(current);
+          break;
+        case Command::NAME:
+          ret = process_name(current);
+          break;
+        case Command::ATTRIBUTE_VALUE:
+          ret = process_attribute_value(current);
+          break;
+        case Command::EQUAL:
+          ret = process_equal(current);
+          break;
+        case Command::START_OR_EMPTY_TAG:
+          ret = process_start_or_empty_tag(current);
+          break;
+        case Command::END_TAG:
+          ret = process_end_tag(current);
+          break;
+      }
+
+      switch (ret) {
+        case Process::NEED_MORE:
+        case Process::ERROR:
+          // Handlers must not queue anything before bailing out, so the
+          // queue is exactly as it was once the command is back.
+          cmds_.push_back(current);
+          assert(cmds_.size() == old_size);
+          return consumed;
+        case Process::CONTINUE:
+          break;
+      }
+    }
+  }
+
+ // Line counter; starts at 1 and is advanced by handle_ws() on '\n'.
+ uint64_t line() const override { return line_; }
+
+ // Column counter; handle_ws() resets it to 0 on '\n' and increments it for
+ // any other character it is given.
+ uint64_t column() const override { return column_; }
private:
+ // Result of running one queued command, see process().
+ enum class Process {
+ NEED_MORE, // command is put back on cmds_ and process() returns
+ ERROR, // handled by process() exactly like NEED_MORE
+ CONTINUE, // process() goes on with the next queued command
+ };
+
+ // Result of comparing the buffer against a pattern, see match() / find().
+ enum class Match {
+ FULL_MATCH, // the whole pattern was found
+ PARTIAL_MATCH, // the data ended before the pattern could be fully compared
+ NO_MATCH, // the data differs from the pattern
+ };
+
+ // Kinds of work items kept in cmds_. process() dispatches each value to
+ // the handler of the same name (fill_buffer() or process_*()).
+ enum class Command {
+ FILL_BUFFER,
+
+ ATTRIBUTE,
+ ATTRIBUTE_VALUE,
+ COMMENT,
+ ELEMENT,
+ END_TAG,
+ EQUAL,
+ MISC,
+ NAME,
+ PROCESSING_INSTRUCTION,
+ SPACE,
+ START_OR_EMPTY_TAG,
+ XMLDECL,
+ };
+
+ // How often a command may match. no_match() reports an error for ONE and
+ // ONE_OR_MANY; add_if_more() queues the command again for the *_MANY
+ // values.
+ enum class Count {
+ ONE,
+ ONE_OR_MANY,
+ ZERO_OR_ONE,
+ ZERO_OR_MANY,
+ };
+
+ // One entry of cmds_: what to parse, how often, and a handler specific
+ // progress value.
+ struct CommandItem {
+ Command const command;
+ Count const count;
+ // Meaning depends on the handler: a position in the buffer for most of
+ // them, a whitespace counter for SPACE and ATTRIBUTE.
+ std::size_t offset;
+
+ CommandItem(Command command, Count count, std::size_t offset = 0)
+ : command(command), count(count), offset(offset) {}
+ };
+
+ // State kept per add_to_stack() / pop_stack() pair.
+ struct StackItem {
+ // Pairs of (position, length) pushed by process_name() and
+ // process_attribute_value().
+ std::vector<std::size_t> offsets;
+ };
+
+ // Runs decoder_ over `data`, starting at `offset`, writing into buffer_.
+ // Returns NEED_MORE when `offset` is at or past the end of `data` or when
+ // the decoder asks for more, ERROR (after telling the delegate) when the
+ // decoder rejects the input, CONTINUE otherwise.
+ Process fill_buffer(std::span<uint8_t const> data,
+ std::size_t offset,
+ std::size_t& consumed) {
+ if (offset >= data.size())
+ return Process::NEED_MORE;
+
+ std::size_t tmp = offset;
+ // NOTE(review): wspan() can return an empty span (see
+ // DynamicBuffer::wspan); that is not checked here.
+ auto wspan = buffer_->wspan(4);
+ switch (decoder_->decode(data, tmp, wspan, consumed)) {
+ case Decoder::State::GOOD:
+ break;
+ case Decoder::State::NEED_MORE:
+ return Process::NEED_MORE;
+ case Decoder::State::INVALID:
+ delegate_->error("Invalid data");
+ return Process::ERROR;
+ }
+ // NOTE(review): `consumed` is both committed to the buffer here and
+ // returned by process(); `tmp` is not used after decode(). Confirm which
+ // of the two decode() treats as input and which as output position.
+ buffer_->commit(consumed);
+ return Process::CONTINUE;
+ }
+
+ // Queues the commands for a whole document. process() takes commands from
+ // the back of cmds_, so the parts are queued in reverse order.
+ void expect_document() {
+ // document := prolog element Misc*
+ expect_misc(Count::ZERO_OR_MANY);
+ expect_element(Count::ONE);
+ expect_prolog();
+ }
+
+ // Queues a MISC command, handled by process_misc().
+ void expect_misc(Count count) {
+ cmds_.emplace_back(Command::MISC, count);
+ }
+
+ // Queues a START_OR_EMPTY_TAG command; process_start_or_empty_tag() queues
+ // the end tag itself when the tag turns out not to be empty.
+ void expect_element(Count count) {
+ // element ::= EmptyElemTag | STag content ETag
+ cmds_.emplace_back(Command::START_OR_EMPTY_TAG, count);
+ }
+
+ // Queues an END_TAG command, handled by process_end_tag().
+ void expect_end_tag(Count count) {
+ cmds_.emplace_back(Command::END_TAG, count);
+ }
+
+ // Queues the commands for the prolog, last part first as cmds_ is
+ // processed from the back.
+ void expect_prolog() {
+ // prolog := XMLDecl? Misc* (doctypedecl Misc*)?
+ expect_misc(Count::ZERO_OR_MANY);
+ expect_doctypedecl(Count::ZERO_OR_ONE);
+ expect_misc(Count::ZERO_OR_MANY);
+ expect_xmldecl(Count::ZERO_OR_ONE);
+ }
+
+ // Queues an XMLDECL command, handled by process_xmldecl().
+ void expect_xmldecl(Count count) {
+ cmds_.emplace_back(Command::XMLDECL, count);
+ }
+
+ // Placeholder: queues nothing, so a doctype declaration is not parsed.
+ void expect_doctypedecl(Count) {
+ // TODO
+ }
+
+ // Queues a COMMENT command whose offset starts at `start_offset`.
+ void expect_comment(Count count, std::size_t start_offset = 0) {
+ // Comment should never be more than one, should be MISC that is repeated.
+ assert(count == Count::ONE);
+ cmds_.emplace_back(Command::COMMENT, count, start_offset);
+ }
+
+ // Placeholder: queues nothing, so element content is not parsed.
+ void expect_content(Count) {
+ // TODO
+ }
+
+ // Queues a PROCESSING_INSTRUCTION command whose offset starts at
+ // `start_offset`.
+ void expect_pi(Count count, std::size_t start_offset = 0) {
+ // PI should never be more than one, should be MISC that is repeated.
+ assert(count == Count::ONE);
+ cmds_.emplace_back(Command::PROCESSING_INSTRUCTION, count, start_offset);
+ }
+
+ // Queues a SPACE command, handled by process_space().
+ void expect_space(Count count) {
+ // There is no way to have S S, as S is one continuous run of whitespace,
+ // so we should never ask for more than one or zero.
+ assert(count == Count::ZERO_OR_ONE || count == Count::ONE);
+ cmds_.emplace_back(Command::SPACE, count);
+ }
+
+  // Queues the commands needed to parse attributes.
+  //
+  // ONE queues S Name Eq AttValue directly. ONE_OR_MANY does the same, but
+  // first queues an optional repeating ATTRIBUTE command that runs after the
+  // mandatory one. The two optional counts only queue an ATTRIBUTE command
+  // and leave the decision to process_attribute().
+  void expect_attribute(Count count) {
+    switch (count) {
+      case Count::ONE_OR_MANY:
+        cmds_.emplace_back(Command::ATTRIBUTE, Count::ZERO_OR_MANY);
+        // One mandatory attribute follows, same as for ONE.
+        [[fallthrough]];
+      case Count::ONE:
+        // Attribute ::= Name Eq AttValue
+        // Queued in reverse as cmds_ is processed from the back.
+        expect_attribute_value(Count::ONE);
+        expect_equal(Count::ONE);
+        expect_name(Count::ONE);
+        expect_space(Count::ONE);
+        break;
+      case Count::ZERO_OR_ONE:
+      case Count::ZERO_OR_MANY:
+        cmds_.emplace_back(Command::ATTRIBUTE, count);
+        break;
+    }
+  }
+
+ // Queues an ATTRIBUTE_VALUE command, handled by
+ // process_attribute_value().
+ void expect_attribute_value(Count count) {
+ cmds_.emplace_back(Command::ATTRIBUTE_VALUE, count);
+ }
+
+ // Queues an EQUAL command with an optional SPACE command on either side.
+ void expect_equal(Count count) {
+ // Eq ::= S? '=' S?
+ expect_space(Count::ZERO_OR_ONE);
+ cmds_.emplace_back(Command::EQUAL, count);
+ expect_space(Count::ZERO_OR_ONE);
+ }
+
+ // Queues a NAME command, handled by process_name().
+ void expect_name(Count count) {
+ cmds_.emplace_back(Command::NAME, count);
+ }
+
+ // Looks at the start of the buffer without consuming anything and queues
+ // the command for whichever Misc alternative is found there. When none
+ // matches the outcome is decided by no_match().
+ Process process_misc(CommandItem const& item) {
+ // Misc := Comment | PI | S
+ assert(item.offset == 0);
+
+ switch (match("<!--")) {
+ case Match::FULL_MATCH:
+ add_if_more(item);
+ // NOTE(review): "<!--" is four characters but the comment command is
+ // started at offset 3 -- confirm against process_comment().
+ expect_comment(Count::ONE, 3);
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ break;
+ }
+
+ switch (match("<?")) {
+ case Match::FULL_MATCH:
+ add_if_more(item);
+ expect_pi(Count::ONE, 2);
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ break;
+ }
+
+ switch (match_s()) {
+ case Match::FULL_MATCH:
+ add_if_more(item);
+ expect_space(Count::ONE);
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ break;
+ }
+
+ return no_match(item);
+ }
+
+ // Handler for ATTRIBUTE commands. item.offset counts the whitespace
+ // characters consumed so far, it is not a buffer position.
+ Process process_attribute(CommandItem& item) {
+ // This actually parses (S Attribute)* when followed by S?
+ // for Attribute parsing see expect_attribute()
+ // So we need to figure out if the S means start of attribute
+ // or just an S. We do this by checking if the first non-S is
+ // a namestart or something else. We consume the S.
+ uint32_t last_char;
+ auto ret = consume_space(item.offset, last_char);
+ if (ret != Process::CONTINUE)
+ return ret;
+
+ // No S, cannot be followed by an attribute then.
+ if (item.offset == 0)
+ return no_match(item);
+
+ // First character after S isn't a valid first character of a name,
+ // cannot be followed by an attribute then.
+ if (!is_namestartchar(last_char))
+ return no_match(item);
+
+ // NOTE(review): unlike the other handlers this one never calls
+ // add_if_more(), so a ZERO_OR_MANY item is not queued again after a
+ // match -- confirm that this is intended.
+ expect_attribute_value(Count::ONE);
+ expect_equal(Count::ONE);
+ expect_name(Count::ONE);
+ return Process::CONTINUE;
+ }
+
+  // Handler for EQUAL commands: consumes a single '=' from the buffer. The
+  // optional whitespace on either side of it (Eq ::= S? '=' S?) is queued
+  // separately by expect_equal().
+  Process process_equal(CommandItem const& item) {
+    auto const outcome = match_consume("=");
+    if (outcome == Match::PARTIAL_MATCH)
+      return Process::NEED_MORE;
+    if (outcome == Match::NO_MATCH)
+      return no_match(item);
+    add_if_more(item);
+    return Process::CONTINUE;
+  }
+
+ // Handler for NAME commands. item.offset is the number of bytes of the
+ // name accepted so far. When the name is complete its position and length
+ // are appended to the offsets of the innermost stack entry and the name
+ // is consumed from the buffer.
+ Process process_name(CommandItem& item) {
+ // Name ::= NameStartChar (NameChar)*
+ auto data = buffer_->rspan(item.offset + 4);
+ while (true) {
+ // Decode into a scratch position so that item.offset only moves past
+ // characters that belong to the name.
+ std::size_t tmp = item.offset;
+ auto c = utf::read8(data, tmp);
+ if (c == utf::NEED_MORE)
+ return Process::NEED_MORE;
+ // NOTE(review): `tmp` may already be past the offending character
+ // here, in which case invalid_char() reports a different byte.
+ if (c == utf::INVALID || !valid_char(c))
+ return invalid_char(data, tmp);
+ if (item.offset == 0) {
+ if (!is_namestartchar(c))
+ return no_match(item);
+ } else {
+ if (!is_namechar(c))
+ break;
+ }
+ item.offset = tmp;
+ }
+
+ // The cast relies on buffer_ having been wrapped by add_to_stack().
+ assert(!stack_.empty());
+ auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+ stack_.back().offsets.push_back(read_view->consumed());
+ stack_.back().offsets.push_back(item.offset);
+ buffer_->consume(item.offset);
+ return Process::CONTINUE;
+ }
+
+ // Handler for ATTRIBUTE_VALUE commands. Scans from the opening quote to
+ // the matching closing quote; item.offset is the number of bytes scanned
+ // so far. On success the position and length of the value, quotes
+ // included, are appended to the offsets of the innermost stack entry and
+ // the value is consumed from the buffer.
+ Process process_attribute_value(CommandItem& item) {
+ // AttValue ::= '"' ([^<&"] | Reference)* '"'
+ // | "'" ([^<&'] | Reference)* "'"
+
+ uint32_t end_char;
+ auto data = buffer_->rspan(item.offset + 4);
+
+ if (item.offset == 0) {
+ std::size_t tmp = item.offset;
+ auto c = utf::read8(data, tmp);
+ if (c == utf::NEED_MORE)
+ return Process::NEED_MORE;
+ if (c == utf::INVALID || !valid_char(c))
+ return invalid_char(data, tmp);
+ if (c != '"' && c != '\'')
+ return no_match(item);
+ item.offset = tmp;
+ end_char = c;
+ } else {
+ // Resuming after NEED_MORE: the opening quote is still the first byte.
+ assert(!data.empty());
+ end_char = data[0]; // ok as both " and ' are ASCII
+ }
+
+ while (true) {
+ auto c = utf::read8(data, item.offset);
+ if (c == utf::NEED_MORE)
+ return Process::NEED_MORE;
+ if (c == utf::INVALID || !valid_char(c))
+ return invalid_char(data, item.offset);
+ if (c == end_char)
+ break;
+ // TODO: Should we validate reference already here or do we let
+ // unquote take care of that? As Reference can't contain end_char
+ // only checking for end_char is safe here.
+ }
+
+ // The cast relies on buffer_ having been wrapped by add_to_stack().
+ assert(!stack_.empty());
+ auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+ stack_.back().offsets.push_back(read_view->consumed());
+ stack_.back().offsets.push_back(item.offset);
+ buffer_->consume(item.offset);
+ return Process::CONTINUE;
+ }
+
+ // Handler for COMMENT commands: searches for the closing "-->", hands the
+ // text before it to the delegate and consumes the comment from the
+ // buffer. Returns NEED_MORE until the terminator has been seen.
+ //
+ // NOTE(review): the two ways of getting here disagree about the buffer.
+ // With item.offset == 0 the opening "<!--" is consumed below, while
+ // process_misc() starts this command at offset 3 with "<!--" still in the
+ // buffer. "<!--" is four characters but 3 is used throughout. Confirm the
+ // intended offsets together with what find() leaves in item.offset.
+ Process process_comment(CommandItem& item) {
+ if (item.offset == 0) {
+ switch (match_consume("<!--")) {
+ case Match::FULL_MATCH:
+ item.offset += 3;
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ return no_match(item);
+ }
+ }
+
+ // find() updates item.offset, so a later call resumes the search there.
+ auto match = find("-->", item.offset);
+ switch (match) {
+ case Match::FULL_MATCH: {
+ auto data = buffer_->rspan(item.offset);
+ assert(data.size() >= item.offset);
+ delegate_->comment(
+ make_string_view(data.subspan(3, item.offset - 3)));
+ buffer_->consume(item.offset + 3);
+ return Process::CONTINUE;
+ }
+ case Match::NO_MATCH:
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ }
+ }
+
+ // Handler for PROCESSING_INSTRUCTION commands. Not implemented: once the
+ // opening "<?" has been dealt with it always reports an error.
+ Process process_processing_instruction(CommandItem& item) {
+ if (item.offset == 0) {
+ switch (match_consume("<?")) {
+ case Match::FULL_MATCH:
+ item.offset += 2;
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ return no_match(item);
+ }
+ }
+
+ // TODO
+ delegate_->error("PI not supported");
+ return Process::ERROR;
+ }
+
+ // Starts a nested parse: queues `item` again with the given offset so that
+ // its handler runs a second time once the sub commands are done, opens a
+ // new stack entry, wraps buffer_ in a read view and skips `offset` bytes
+ // in that view.
+ void add_to_stack(CommandItem const& item, std::size_t offset) {
+ cmds_.emplace_back(item.command, item.count, offset);
+ stack_.emplace_back();
+ buffer_ = make_read_view_buffer(std::move(buffer_));
+ buffer_->consume(offset);
+ }
+
+ // Ends a nested parse started by add_to_stack(): swaps the collected
+ // offsets into `attr`, replaces the read view with the buffer it
+ // releases, drops the stack entry and returns what the view reported as
+ // consumed.
+ std::size_t pop_stack(std::vector<std::size_t>& attr) {
+ assert(!stack_.empty());
+ std::swap(attr, stack_.back().offsets);
+
+ // The cast relies on buffer_ having been wrapped by add_to_stack().
+ auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+ auto consumed = read_view->consumed();
+
+ // read_view is not used after this line; assigning to buffer_ replaces
+ // the object it points to.
+ buffer_ = read_view->release();
+ stack_.pop_back();
+
+ return consumed;
+ }
+
+ // Handler for XMLDECL commands, run twice per declaration.
+ //
+ // First run (item.offset == 0): on "<?xml" a nested parse is started and
+ // generic attribute parsing is queued.
+ // Second run (item.offset == 5): expects "?>" and then validates the
+ // collected attributes, which must be version, optionally followed by
+ // encoding and standalone, in that order. An encoding attribute replaces
+ // decoder_ unless a decoder was forced.
+ Process process_xmldecl(CommandItem const& item) {
+ // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+ if (item.offset == 0) {
+ // NOTE(review): this is a prefix comparison, so a processing
+ // instruction whose target merely starts with "xml" matches as well.
+ switch (match("<?xml")) {
+ case Match::FULL_MATCH:
+ add_to_stack(item, /* offset */ 5);
+ expect_space(Count::ZERO_OR_ONE);
+ // Parsing as generic "Attribute" here and doing validation later.
+ expect_attribute(Count::ONE_OR_MANY);
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ return no_match(item);
+ }
+ }
+
+ assert(item.offset == 5);
+
+ // Remember that this is still reading from the read view buffer.
+ switch (match_consume("?>")) {
+ case Match::FULL_MATCH:
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ delegate_->error(std::format("Expected end of {}",
+ command_name(item.command)));
+ return Process::ERROR;
+ }
+
+ std::vector<std::size_t> attr;
+ auto const consumed = pop_stack(attr);
+
+ // Now we're back to the real buffer
+ auto data = buffer_->rspan(consumed);
+ // Index into attr; each attribute takes four entries: name position,
+ // name length, value position, value length (quotes included).
+ std::size_t a = 0;
+
+ if (a + 4 <= attr.size() &&
+ make_string_view(data.subspan(attr[a + 0],
+ attr[a + 1])) == "version") {
+ // + 1 / - 2 strips the surrounding quotes.
+ auto version = make_string_view(data.subspan(attr[a + 2] + 1,
+ attr[a + 3] - 2));
+ if (!valid_version(version)) {
+ delegate_->error(std::format("Unsupported xmldecl version, {}",
+ version));
+ return Process::ERROR;
+ }
+ a += 4;
+ } else {
+ // No version
+ delegate_->error("Invalid xmldecl, must have a version attribute first.");
+ return Process::ERROR;
+ }
+
+ if (a + 4 <= attr.size() &&
+ make_string_view(data.subspan(attr[a + 0],
+ attr[a + 1])) == "encoding") {
+ auto encoding = make_string_view(data.subspan(attr[a + 2] + 1,
+ attr[a + 3] - 2));
+ if (forced_decoder_) {
+ // encoding value is ignored
+ // TODO: Should we check that it is valid anyway?
+ } else {
+ // Built-in decoders first, then the factory if there is one.
+ auto decoder = pick_decoder_for_encoding(encoding, nullptr);
+ if (!decoder && decoder_factory_)
+ decoder = decoder_factory_->create(encoding);
+ if (!decoder) {
+ delegate_->error(std::format("Unknown encoding {}", encoding));
+ return Process::ERROR;
+ }
+ std::swap(decoder_, decoder);
+ // TODO: Re-decode the rest of the buffer?
+ }
+ a += 4;
+ }
+
+ if (a + 4 <= attr.size() &&
+ make_string_view(data.subspan(attr[a + 0],
+ attr[a + 1])) == "standalone") {
+ auto sd = make_string_view(data.subspan(attr[a + 2] + 1,
+ attr[a + 3] - 2));
+ if (sd == "yes") {
+ // TODO: Handle standalone == yes
+ } else if (sd == "no") {
+ // TODO: Handle standalone == no
+ } else {
+ delegate_->error(std::format(
+ "Invalid xmldecl, standalone attribute has unsupported value, {}",
+ sd));
+ return Process::ERROR;
+ }
+ a += 4;
+ }
+
+ // Anything left is an unknown, repeated or misplaced attribute.
+ if (a < attr.size()) {
+ delegate_->error(
+ std::format("Invalid xmldecl, unknown attribute, {}",
+ make_string_view(data.subspan(attr[a + 0],
+ attr[a + 1]))));
+ return Process::ERROR;
+ }
+
+ buffer_->consume(consumed);
+ return Process::CONTINUE;
+ }
+
+ // Handler for START_OR_EMPTY_TAG commands, run twice per tag.
+ //
+ // First run (item.offset == 0): on "<" a nested parse is started and the
+ // name, attributes and optional whitespace are queued.
+ // Second run (item.offset == 1): expects "/>" or ">", builds the attribute
+ // list and calls empty_element() or start_element() on the delegate. For
+ // a start tag the end tag is queued as well.
+ Process process_start_or_empty_tag(CommandItem const& item) {
+ // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
+ // STag ::= '<' Name (S Attribute)* S? '>'
+ if (item.offset == 0) {
+ switch (match("<")) {
+ case Match::FULL_MATCH:
+ add_to_stack(item, /* offset */ 1);
+ expect_space(Count::ZERO_OR_ONE);
+ expect_attribute(Count::ZERO_OR_MANY);
+ expect_name(Count::ONE);
+ return Process::CONTINUE;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ return no_match(item);
+ }
+ }
+
+ assert(item.offset == 1);
+
+ bool empty_tag;
+
+ // Remember that this is still reading from the read view buffer.
+ switch (match_consume("/>")) {
+ case Match::FULL_MATCH:
+ empty_tag = true;
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ switch (match_consume(">")) {
+ case Match::FULL_MATCH:
+ empty_tag = false;
+ break;
+ case Match::PARTIAL_MATCH:
+ return Process::NEED_MORE;
+ case Match::NO_MATCH:
+ delegate_->error(std::format("Expected end of {}",
+ command_name(item.command)));
+ return Process::ERROR;
+ }
+ break;
+ }
+
+ std::vector<std::size_t> attr;
+ auto const consumed = pop_stack(attr);
+
+ // Now we're back to the real buffer
+ auto data = buffer_->rspan(consumed);
+
+ // The first two entries are position and length of the element name.
+ assert(attr.size() >= 2);
+ auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+ AttributesImpl attributes;
+ // init() takes the offsets by const reference, so nothing is moved here.
+ if (!attributes.init(entities_, data, std::move(attr), 2)) {
+ delegate_->error("Invalid references in attribute values");
+ return Process::ERROR;
+ }
+
+ add_if_more(item);
+
+ if (empty_tag) {
+ delegate_->empty_element(name, attributes);
+ } else {
+ delegate_->start_element(name, attributes);
+ expect_end_tag(Count::ONE);
+ expect_content(Count::ONE);
+ }
+
+ // `name` and `attributes` view the buffer, so consume only after the
+ // delegate has been called.
+ buffer_->consume(consumed);
+ return Process::CONTINUE;
+ }
+
+  // Handler for END_TAG commands, run twice per tag.
+  //
+  // First run (item.offset == 0): on "</" a nested parse is started and the
+  // name plus optional whitespace are queued.
+  // Second run (item.offset == 2): expects ">" and reports the element name
+  // to the delegate through end_element().
+  Process process_end_tag(CommandItem const& item) {
+    // ETag ::= '</' Name S? '>'
+    if (item.offset == 0) {
+      switch (match("</")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 2);
+          expect_space(Count::ZERO_OR_ONE);
+          expect_name(Count::ONE);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    // add_to_stack() above queued this command again with offset 2, the
+    // length of "</". (This used to assert 1, which fired on every end tag
+    // in debug builds.)
+    assert(item.offset == 2);
+
+    // Remember that this is still reading from the read view buffer.
+    switch (match_consume(">")) {
+      case Match::FULL_MATCH:
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        delegate_->error(std::format("Expected end of {}",
+                                     command_name(item.command)));
+        return Process::ERROR;
+    }
+
+    std::vector<std::size_t> attr;
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+
+    // Exactly one (position, length) pair: the element name.
+    assert(attr.size() == 2);
+    auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+    add_if_more(item);
+
+    delegate_->end_element(name);
+
+    // `name` views the buffer, so consume only after the delegate call.
+    buffer_->consume(consumed);
+    return Process::CONTINUE;
+  }
+
+  // Accepts version numbers of the form "1." followed by one or more
+  // decimal digits.
+  static bool valid_version(std::string_view version) {
+    constexpr std::string_view kMajor = "1.";
+    if (version.size() <= kMajor.size() || !version.starts_with(kMajor))
+      return false;
+    auto const minor = version.substr(kMajor.size());
+    return std::all_of(minor.begin(), minor.end(),
+                       [](char c) { return is_digit(c); });
+  }
+
+ // Handler for ELEMENT commands. Not implemented, always reports an error.
+ Process process_element(CommandItem& item) {
+ // TODO
+ delegate_->error("Element is not yet supported");
+ return Process::ERROR;
+ }
+
+ // Consumes the run of whitespace at the start of the buffer.
+ //
+ // `count` is incremented once per whitespace character. On CONTINUE
+ // `last_char` holds the first character that is not whitespace; that
+ // character is left in the buffer. On NEED_MORE the whitespace seen so
+ // far has been consumed and `last_char` is not set. On an invalid
+ // character nothing is consumed.
+ Process consume_space(std::size_t& count, uint32_t& last_char) {
+ auto data = buffer_->rspan(4);
+ std::size_t consumed = 0;
+ while (true) {
+ // Decode into a scratch position; `consumed` only moves past
+ // characters that turned out to be whitespace.
+ std::size_t offset = consumed;
+ auto c = utf::read8(data, offset);
+ if (c == utf::NEED_MORE) {
+ buffer_->consume(consumed);
+ return Process::NEED_MORE;
+ }
+ if (c == utf::INVALID || !valid_char(c))
+ return invalid_char(data, offset);
+ if (!is_ws(c)) {
+ last_char = c;
+ buffer_->consume(consumed);
+ return Process::CONTINUE;
+ }
+ ++count;
+ handle_ws(c);
+ consumed = offset;
+ }
+ }
+
+  // Handler for SPACE commands.
+  // S ::= (#x20 | #x9 | #xD | #xA)+
+  // Whitespace is consumed as soon as it is seen, so item.offset is not a
+  // buffer position here; it only counts the whitespace characters found.
+  Process process_space(CommandItem& item) {
+    uint32_t ignored;
+    if (auto const status = consume_space(item.offset, ignored);
+        status != Process::CONTINUE) {
+      return status;
+    }
+
+    // Not a single whitespace character: count decides if that is fine.
+    if (item.offset == 0)
+      return no_match(item);
+
+    add_if_more(item);
+    return Process::CONTINUE;
+  }
+
+  // Called after `item` matched once. Commands that may repeat are queued
+  // again as ZERO_OR_MANY, as any further match is optional. Commands with
+  // count ONE or ZERO_OR_ONE are done.
+  void add_if_more(CommandItem const& item) {
+    bool const repeats = item.count == Count::ONE_OR_MANY ||
+        item.count == Count::ZERO_OR_MANY;
+    if (repeats)
+      cmds_.emplace_back(item.command, Count::ZERO_OR_MANY);
+  }
+
+  // Searches the buffer for `str`, starting at byte position `offset`.
+  //
+  // FULL_MATCH:    `offset` is the position of the first byte of the match.
+  // PARTIAL_MATCH: the data ends with a proper prefix of `str`; `offset` is
+  //                where that prefix starts, so a later call can pick up
+  //                from there once more data has arrived.
+  // NO_MATCH:      `offset` is the end of the data, or unchanged if it was
+  //                already at or past the end.
+  //
+  // The previous hand written scan returned a position one before the match
+  // (wrapping around for a match at position 0) and did not re-test the
+  // current byte after a mismatch, so "-->" was not found in "--->".
+  Match find(std::string_view str, std::size_t& offset) {
+    auto const data = make_string_view(buffer_->rspan(offset + str.size()));
+    if (offset >= data.size())
+      return Match::NO_MATCH;
+
+    auto const hit = data.find(str, offset);
+    if (hit != std::string_view::npos) {
+      offset = hit;
+      return Match::FULL_MATCH;
+    }
+
+    // Longest proper prefix of str that the searched data ends with. The
+    // longest is tried first, so the earliest possible start wins.
+    auto tail = std::min(str.size() - 1, data.size() - offset);
+    for (; tail > 0; --tail) {
+      if (data.substr(data.size() - tail) == str.substr(0, tail)) {
+        offset = data.size() - tail;
+        return Match::PARTIAL_MATCH;
+      }
+    }
+
+    offset = data.size();
+    return Match::NO_MATCH;
+  }
+
+  // Compares `str` against the buffer content at byte position `offset`
+  // without consuming anything. Yields NO_MATCH on the first differing
+  // byte, PARTIAL_MATCH when the data runs out before all of `str` has been
+  // compared (also when there is no data at `offset` at all), and
+  // FULL_MATCH otherwise.
+  Match match(std::string_view str, std::size_t offset = 0) {
+    auto const data = buffer_->rspan(offset + str.size());
+    if (offset >= data.size())
+      return Match::PARTIAL_MATCH;
+    auto const window = data.subspan(offset);
+    auto const cmp_len = std::min(str.size(), window.size());
+    for (std::size_t pos = 0; pos != cmp_len; ++pos) {
+      if (str[pos] != window[pos])
+        return Match::NO_MATCH;
+    }
+    return cmp_len == str.size() ? Match::FULL_MATCH : Match::PARTIAL_MATCH;
+  }
+
+  // Same as match(str), but a full match is also consumed from the buffer.
+  Match match_consume(std::string_view str) {
+    auto const outcome = match(str);
+    if (outcome != Match::FULL_MATCH)
+      return outcome;
+    buffer_->consume(str.size());
+    return outcome;
+  }
+
+  // Tests whether the buffer starts with a whitespace character, without
+  // consuming it. PARTIAL_MATCH is only reported for an empty buffer; an
+  // incomplete, invalid or non-whitespace character is a NO_MATCH.
+  Match match_s() {
+    auto data = buffer_->rspan(4);
+    std::size_t pos = 0;
+    auto const c = utf::read8(data, pos);
+    if (c == utf::NEED_MORE)
+      return data.size() == 0 ? Match::PARTIAL_MATCH : Match::NO_MATCH;
+    bool const is_space = c != utf::INVALID && valid_char(c) && is_ws(c);
+    return is_space ? Match::FULL_MATCH : Match::NO_MATCH;
+  }
+
+  // Decides what a failed match means for `item`: commands that must match
+  // at least once report "Expected <name>" to the delegate and fail, while
+  // optional ones are simply skipped.
+  Process no_match(CommandItem const& item) {
+    bool const required = item.count == Count::ONE ||
+        item.count == Count::ONE_OR_MANY;
+    if (!required)
+      return Process::CONTINUE;
+    delegate_->error(std::format("Expected {}",
+                                 command_name(item.command)));
+    return Process::ERROR;
+  }
+
+  // Keeps line_ and column_ up to date for one whitespace character: a line
+  // feed starts a new line at column 0, anything else moves one column on.
+  void handle_ws(uint32_t c) {
+    if (c != '\n') {
+      ++column_;
+      return;
+    }
+    ++line_;
+    column_ = 0;
+  }
+
+  // Reports an invalid character to the delegate and returns ERROR. The
+  // byte at `offset` is included in the message when there is one.
+  //
+  // Callers pass the position utf::read8() left behind, which for a decoded
+  // but disallowed character is already past that character and may equal
+  // data.size(); indexing the span there would be out of bounds.
+  Process invalid_char(std::span<uint8_t const> data, std::size_t offset) {
+    if (offset < data.size()) {
+      delegate_->error(std::format("Invalid char {:02x}", data[offset]));
+    } else {
+      delegate_->error("Invalid char");
+    }
+    return Process::ERROR;
+  }
+
+  // Human readable name of `command`, used in error messages. Asserts, and
+  // yields an empty view, for a value that is not a known command.
+  static std::string_view command_name(Command command) {
+    std::string_view label;
+    switch (command) {
+      case Command::FILL_BUFFER:
+        label = "more data"sv;
+        break;
+      case Command::ATTRIBUTE:
+        label = "attribute"sv;
+        break;
+      case Command::ATTRIBUTE_VALUE:
+        label = "attribute value"sv;
+        break;
+      case Command::COMMENT:
+        label = "comment"sv;
+        break;
+      case Command::ELEMENT:
+        label = "element"sv;
+        break;
+      case Command::END_TAG:
+        label = "end tag"sv;
+        break;
+      case Command::EQUAL:
+        label = "equal sign (=)"sv;
+        break;
+      case Command::MISC:
+        label = "misc"sv;
+        break;
+      case Command::NAME:
+        label = "name"sv;
+        break;
+      case Command::PROCESSING_INSTRUCTION:
+        label = "processing instruction"sv;
+        break;
+      case Command::SPACE:
+        label = "whitespace"sv;
+        break;
+      case Command::START_OR_EMPTY_TAG:
+        label = "element"sv;
+        break;
+      case Command::XMLDECL:
+        label = "xml declaration"sv;
+        break;
+    }
+    // Every known command has a non-empty label.
+    assert(!label.empty());
+    return label;
+  }
+
std::shared_ptr<Delegate> delegate_;
std::shared_ptr<DecoderFactory> decoder_factory_;
std::unique_ptr<Decoder> decoder_;
- std::size_t default_buffer_size_;
- std::size_t max_buffer_size_;
+ bool const forced_decoder_;
+ std::unique_ptr<Buffer> buffer_;
+ Entities entities_;
+ std::vector<CommandItem> cmds_;
+ std::vector<StackItem> stack_;
+ uint64_t line_{1};
+ uint64_t column_{0};
};
} // namespace
@@ -117,9 +1196,9 @@ std::unique_ptr<Processor> create_processor(
decoder_factory.get());
}
- std::size_t default_buffer_size = 8192;
+ std::size_t default_buffer_size = kDefaultBufferSize;
if (opt_default_buffer_size.has_value())
- default_buffer_size = std::max(static_cast<std::size_t>(128),
+ default_buffer_size = std::max(kMinBufferSize,
opt_default_buffer_size.value());
// This value is documented in public headers. Do NOT change.
std::size_t max_buffer_size = 10 * 1024 * 1024;
@@ -136,7 +1215,8 @@ std::unique_ptr<Processor> create_processor(
max_buffer_size);
}
-std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) {
+std::unique_ptr<Processor>
+Processor::create(std::shared_ptr<Delegate> delegate) {
return create_processor(std::move(delegate), nullptr,
std::nullopt, std::nullopt, std::nullopt);
}
diff --git a/sax/src/utils.cc b/sax/src/utils.cc
index f0366d5..e3a53b1 100644
--- a/sax/src/utils.cc
+++ b/sax/src/utils.cc
@@ -9,7 +9,7 @@ namespace sax {
namespace {
-std::string cleanup_encoding(std::string const& str) {
+std::string cleanup_encoding(std::string_view str) {
std::string ret;
ret.reserve(str.size());
for (auto c : str) {
@@ -29,29 +29,29 @@ std::string cleanup_encoding(std::string const& str) {
// Names inspired by:
// https://www.iana.org/assignments/character-sets/character-sets.xhtml
std::unique_ptr<Decoder> pick_decoder_for_encoding(
- std::string const& encoding, DecoderFactory* factory) {
+ std::string_view encoding, DecoderFactory* factory) {
auto clean_enc = cleanup_encoding(encoding);
- if (clean_enc == "utf-8" || clean_enc == "utf8") {
+ if (clean_enc == "utf-8" || clean_enc == "utf8")
return create_utf8_decoder();
- }
- if (clean_enc == "utf-16" || clean_enc == "utf16") {
+
+ if (clean_enc == "utf-16" || clean_enc == "utf16")
return create_utf16_decoder();
- }
- if (clean_enc == "utf-16be" || clean_enc == "utf16be") {
+
+ if (clean_enc == "utf-16be" || clean_enc == "utf16be")
return create_utf16be_decoder();
- }
- if (clean_enc == "utf-16le" || clean_enc == "utf16le") {
+
+ if (clean_enc == "utf-16le" || clean_enc == "utf16le")
return create_utf16le_decoder();
- }
- if (clean_enc == "utf-32" || clean_enc == "utf32") {
+
+ if (clean_enc == "utf-32" || clean_enc == "utf32")
return create_utf32_decoder();
- }
- if (clean_enc == "utf-32be" || clean_enc == "utf32be") {
+
+ if (clean_enc == "utf-32be" || clean_enc == "utf32be")
return create_utf32be_decoder();
- }
- if (clean_enc == "utf-32le" || clean_enc == "utf32le") {
+
+ if (clean_enc == "utf-32le" || clean_enc == "utf32le")
return create_utf32le_decoder();
- }
+
if (clean_enc == "ascii" || clean_enc == "us-ascii" ||
clean_enc == "usascii" || clean_enc == "iso-ir-6" ||
clean_enc == "ansi-x3-4-1968" || clean_enc == "ansi-x3-4-1986" ||
@@ -59,9 +59,10 @@ std::unique_ptr<Decoder> pick_decoder_for_encoding(
clean_enc == "us" || clean_enc == "ibm367" || clean_enc == "cp367") {
return create_ascii_decoder();
}
- if (factory) {
+
+ if (factory)
return factory->create(encoding);
- }
+
return nullptr;
}
diff --git a/sax/src/utils.hh b/sax/src/utils.hh
index 206d003..074f0c0 100644
--- a/sax/src/utils.hh
+++ b/sax/src/utils.hh
@@ -4,7 +4,7 @@
#include "macros.hh"
#include <memory>
-#include <string>
+#include <string_view>
namespace modxml {
namespace sax {
@@ -13,7 +13,7 @@ class Decoder;
class DecoderFactory;
std::unique_ptr<Decoder> HIDDEN pick_decoder_for_encoding(
- std::string const& encoding,
+ std::string_view encoding,
DecoderFactory* factory);
} // namespace sax