10 files changed, 1922 insertions, 203 deletions
diff --git a/sax/src/buffer.cc b/sax/src/buffer.cc
new file mode 100644
index 0000000..964865d
--- /dev/null
+++ b/sax/src/buffer.cc
@@ -0,0 +1,398 @@
+#include "buffer.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <limits>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+class DynamicBuffer : public Buffer {  // Grows on demand up to max_size.
+ public:
+  DynamicBuffer(std::size_t default_size, std::size_t max_size)
+      : default_size_(std::min(default_size, max_size)), max_size_(max_size),
+        data_(std::make_unique_for_overwrite<uint8_t[]>(default_size_)),
+        size_(default_size_) {}
+  // Returns at least need writable bytes, or an empty span if impossible.
+  std::span<uint8_t> wspan(std::size_t need) override {
+    auto avail = size_ - (offset_ + fill_);
+    if (need > avail) {
+      if (max_size_ - fill_ < need) // Early exit if need is never possible
+        return {};
+      if (offset_ > 0) {  // Move live data to the front to reclaim space.
+        std::copy_n(data_.get() + offset_, fill_, data_.get());
+        offset_ = 0;
+      }
+      avail = size_ - fill_;
+      if (need > avail) {
+        auto const max = std::numeric_limits<std::size_t>::max() / 2;
+        std::size_t new_size = std::max<std::size_t>(size_, 1);
+        while (true) {  // Starts at >= 1: doubling 0 would never terminate.
+          if (new_size <= max) {
+            new_size *= 2;
+          } else {
+            new_size = std::numeric_limits<std::size_t>::max();
+          }
+          if (new_size >= max_size_) {
+            new_size = max_size_;
+            break;
+          }
+          if (new_size - fill_ >= need)
+            break;
+        }
+        // Using new as it has std::nothrow which make_unique lacks.
+        // Easy enough to keep track of the pointers here anyway.
+        auto* tmp = new(std::nothrow) uint8_t[new_size];
+        if (tmp == nullptr)
+          return {};
+        std::copy_n(data_.get(), fill_, tmp);
+        size_ = new_size;
+        data_.reset(tmp);
+      }
+    }
+    return {data_.get() + offset_ + fill_, size_ - (offset_ + fill_)};
+  }
+  // Marks size bytes of the span last returned by wspan() as stored.
+  void commit(std::size_t size) override {
+    assert(size_ - (offset_ + fill_) >= size);
+    fill_ += size;
+  }
+  // All stored bytes are contiguous, so the want hint is not needed.
+  std::span<uint8_t const> rspan(std::size_t) override {
+    return {data_.get() + offset_, fill_};
+  }
+  // Drops size bytes from the front; a drained buffer is reset().
+  void consume(std::size_t size) override {
+    if (size == 0)
+      return;
+    assert(fill_ >= size);
+    fill_ -= size;
+    if (fill_ == 0) {
+      reset();
+    } else {
+      offset_ += size;
+    }
+  }
+  // Same bytes as rspan(), but writable.
+  std::span<uint8_t> mspan(std::size_t) override {
+    return {data_.get() + offset_, fill_};
+  }
+  // Removes up to size bytes from the end; returns the count removed.
+  std::size_t uncommit(std::size_t size) override {
+    auto ret = std::min(size, fill_);
+    fill_ -= ret;
+    if (fill_ == 0) {
+      reset();
+    }
+    return ret;
+  }
+  // True when no bytes are stored.
+  bool empty() const override {
+    return fill_ == 0;
+  }
+  // True once max_size bytes are stored.
+  bool full() const override {
+    return fill_ >= max_size_;
+  }
+  // Empties the buffer and shrinks the storage back to default_size.
+  void reset() override {
+    if (size_ != default_size_)
+      data_ = std::make_unique_for_overwrite<uint8_t[]>(size_ = default_size_);
+    offset_ = 0;
+    fill_ = 0;
+  }
+
+ private:
+  std::size_t const default_size_;
+  std::size_t const max_size_;
+  std::unique_ptr<uint8_t[]> data_;
+  std::size_t size_;
+  std::size_t offset_{0};
+  std::size_t fill_{0};
+};
+
+class FixedBuffer : public Buffer {  // Fixed-capacity circular buffer.
+ public:
+  explicit FixedBuffer(std::size_t size)
+      : size_(size), data_(std::make_unique<uint8_t[]>(size_)) {}
+  // Returns at least need contiguous writable bytes, or an empty span.
+  std::span<uint8_t> wspan(std::size_t need) override {
+    auto avail = wavail();
+    if (need > avail) {
+      if (need > size_ - ravail())  // Early exit if need will never fit
+        return {};
+      if (rptr_ < wptr_ || (rptr_ == wptr_ && !full_)) {
+        rotate();
+        avail = wavail();
+      } else {
+        return {};
+      }
+    }
+    return {data_.get() + wptr_, avail};
+  }
+  // Advances the write pointer by size, wrapping at the end of storage.
+  void commit(std::size_t size) override {
+    if (size == 0)
+      return;
+    assert(wavail() >= size);
+    wptr_ += size;
+    if (wptr_ == size_)
+      wptr_ = 0;
+    if (rptr_ == wptr_)
+      full_ = true;
+  }
+  // Read-only variant of mspan().
+  std::span<uint8_t const> rspan(std::size_t want) override {
+    return mspan(want);
+  }
+  // Advances the read pointer by size; a drained buffer is reset().
+  void consume(std::size_t size) override {
+    if (size == 0)
+      return;
+    assert(ravail() >= size);
+    full_ = false;
+    rptr_ += size;
+    if (rptr_ == size_)
+      rptr_ = 0;
+    if (rptr_ == wptr_)
+      reset();
+  }
+  // Returns the readable bytes; rotates wrapped data when want needs it.
+  std::span<uint8_t> mspan(std::size_t want) override {
+    auto avail = ravail();
+    if (want > avail) {
+      if (rptr_ > 0 && (rptr_ > wptr_ || (rptr_ == wptr_ && full_))) {
+        rotate();  // With rptr_ == 0 the data is already contiguous.
+        avail = ravail();
+      }
+    }
+    return {data_.get() + rptr_, avail};
+  }
+  // Removes up to size of the most recently committed bytes.
+  std::size_t uncommit(std::size_t size) override {
+    auto ret = do_uncommit(size);
+    if (ret < size)
+      ret += do_uncommit(size - ret);  // Continue past the wrap point.
+    // An empty buffer must have both pointers at zero; wspan() relies on it.
+    if (empty())
+      reset();
+    return ret;
+  }
+  // True when no bytes are stored.
+  bool empty() const override {
+    return rptr_ == wptr_ && !full_;
+  }
+  // True when no more bytes fit; a zero-sized buffer is always full.
+  bool full() const override {
+    return size_ == 0 || (rptr_ == wptr_ && full_);
+  }
+  // Discards all content.
+  void reset() override {
+    rptr_ = 0;
+    wptr_ = 0;
+    full_ = false;
+  }
+
+ private:
+  std::size_t ravail() const {  // Contiguous readable bytes at rptr_.
+    if (rptr_ < wptr_)
+      return wptr_ - rptr_;
+    if (rptr_ == wptr_ && !full_)
+      return 0;
+    return size_ - rptr_;
+  }
+
+  std::size_t wavail() const {  // Contiguous writable bytes at wptr_.
+    if (rptr_ > wptr_)
+      return rptr_ - wptr_;
+    if (rptr_ == wptr_ && full_)
+      return 0;
+    return size_ - wptr_;
+  }
+  // Takes back up to size bytes without crossing the wrap point.
+  std::size_t do_uncommit(std::size_t size) {
+    if (size == 0 || (rptr_ == wptr_ && !full_))
+      return 0;
+
+    full_ = false;
+
+    if (wptr_ == 0)
+      wptr_ = size_;
+
+    auto avail = rptr_ < wptr_ ? wptr_ - rptr_ : wptr_;
+    avail = std::min(avail, size);
+    wptr_ -= avail;
+    return avail;
+  }
+  // Moves the stored bytes so that they start at index 0.
+  void rotate() {
+    assert(rptr_ > 0);
+
+    if (rptr_ < wptr_) {
+      std::copy(data_.get() + rptr_, data_.get() + wptr_, data_.get());
+      wptr_ -= rptr_;
+      rptr_ = 0;
+    } else if (wptr_ < rptr_ || (wptr_ == rptr_ && full_)) {
+      auto left = wptr_;  // Newest bytes, stored at [0, wptr_).
+      auto right = size_ - rptr_;  // Oldest bytes, stored at [rptr_, size_).
+      // TODO: Can we do this without allocations?
+      if (left <= right) {
+        auto tmp = std::make_unique<uint8_t[]>(left);
+        std::copy_n(data_.get(), left, tmp.get());
+        std::copy_n(data_.get() + rptr_, right, data_.get());
+        std::copy_n(tmp.get(), left, data_.get() + right);
+      } else {
+        auto tmp = std::make_unique<uint8_t[]>(right);
+        std::copy_n(data_.get() + rptr_, right, tmp.get());
+        std::copy_backward(data_.get(), data_.get() + left,
+                           data_.get() + left + right);
+        std::copy_n(tmp.get(), right, data_.get());
+      }
+      wptr_ = left + right;
+      if (wptr_ == size_)
+        wptr_ = 0;
+      rptr_ = 0;
+    } else {
+      assert(false);
+    }
+  }
+
+  std::size_t const size_;
+  std::unique_ptr<uint8_t[]> data_;
+  std::size_t rptr_{0};
+  std::size_t wptr_{0};
+  bool full_{false};
+};
+
+class ReadViewBufferImpl : public ReadViewBuffer {
+ public:
+  explicit ReadViewBufferImpl(std::unique_ptr<Buffer> buffer)
+      : buffer_(std::move(buffer)) {}
+  // Number of bytes consumed through this view so far.
+  std::size_t consumed() const override {
+    return offset_;
+  }
+  // Hands the wrapped buffer back; buffer_ is left moved-from afterwards.
+  std::unique_ptr<Buffer> release() override {
+    return std::move(buffer_);
+  }
+  // Writing is forwarded unchanged to the wrapped buffer.
+  std::span<uint8_t> wspan(std::size_t need) override {
+    return buffer_->wspan(need);
+  }
+
+  void commit(std::size_t size) override {
+    return buffer_->commit(size);
+  }
+  // Readable bytes of the wrapped buffer, minus those consumed by the view.
+  std::span<uint8_t const> rspan(std::size_t want) override {
+    auto ret = buffer_->rspan(offset_ + want);
+    if (ret.size() <= offset_)
+      return ret.subspan(0, 0);
+    return ret.subspan(offset_, ret.size() - offset_);
+  }
+  // Advances only the view's offset; the wrapped buffer is not consumed.
+  void consume(std::size_t size) override {
+    offset_ += size;
+  }
+  // Writable counterpart of rspan().
+  std::span<uint8_t> mspan(std::size_t want) override {
+    auto ret = buffer_->mspan(offset_ + want);
+    if (ret.size() <= offset_)
+      return ret.subspan(0, 0);
+    return ret.subspan(offset_, ret.size() - offset_);
+  }
+  // Forwarded unchanged to the wrapped buffer.
+  std::size_t uncommit(std::size_t size) override {
+    return buffer_->uncommit(size);
+  }
+  // True when the wrapped buffer holds nothing beyond the view's offset.
+  bool empty() const override {
+    if (buffer_->empty())
+      return true;
+    auto data = buffer_->rspan(offset_ + 1);
+    return data.size() <= offset_;
+  }
+  // Forwarded unchanged to the wrapped buffer.
+  bool full() const override {
+    return buffer_->full();
+  }
+  // Rewinds the view only; the wrapped buffer is left untouched.
+  void reset() override {
+    offset_ = 0;
+  }
+
+ private:
+  std::unique_ptr<Buffer> buffer_;
+  std::size_t offset_{0};
+};
+
+}  // namespace
+
+std::unique_ptr<Buffer> make_buffer(std::size_t default_size,
+                                    std::size_t max_size) {
+  // Only a buffer that starts below its limit needs to be able to grow.
+  if (default_size < max_size)
+    return std::make_unique<DynamicBuffer>(default_size, max_size);
+  return std::make_unique<FixedBuffer>(max_size);
+}
+
+std::unique_ptr<ReadViewBuffer> make_read_view_buffer(
+    std::unique_ptr<Buffer> buffer) {  // Ownership moves into the view.
+  return std::make_unique<ReadViewBufferImpl>(std::move(buffer));
+}
+
+std::size_t Buffer::write(std::span<uint8_t const> data) {
+  std::size_t written = 0;
+  for (auto left = data; !left.empty(); left = data.subspan(written)) {
+    auto const room = wspan();  // Empty once the buffer cannot take more.
+    if (room.empty())
+      break;
+    auto const chunk = std::min(left.size(), room.size());
+    std::copy_n(left.data(), chunk, room.data());
+    commit(chunk);
+    written += chunk;
+  }
+  return written;
+}
+
+bool Buffer::write_all(std::span<uint8_t const> data) {
+  if (data.empty())
+    return true;  // Nothing to write always succeeds.
+  auto const room = wspan(data.size());  // All or nothing: ask for it all.
+  if (room.empty())
+    return false;
+  std::copy_n(data.data(), data.size(), room.data());
+  commit(data.size());
+  return true;
+}
+
+std::size_t Buffer::read(std::span<uint8_t> data) {
+  std::size_t copied = 0;
+  for (auto out = data; !out.empty(); out = data.subspan(copied)) {
+    auto const avail = rspan();  // Empty once the buffer has been drained.
+    if (avail.empty())
+      break;
+    auto const chunk = std::min(out.size(), avail.size());
+    std::copy_n(avail.data(), chunk, out.data());
+    consume(chunk);
+    copied += chunk;
+  }
+  return copied;
+}
+
+bool Buffer::read_all(std::span<uint8_t> data) {
+  auto const avail = rspan(data.size());
+  if (avail.size() < data.size())
+    return false;  // Not enough buffered; nothing is consumed.
+  std::copy_n(avail.data(), data.size(), data.data());
+  consume(data.size());
+  return true;
+}
+
+}  // namespace sax
+}  // namespace modxml
+
diff --git a/sax/src/buffer.hh b/sax/src/buffer.hh
new file mode 100644
index 0000000..d9fb9fc
--- /dev/null
+++ b/sax/src/buffer.hh
@@ -0,0 +1,108 @@
+#ifndef BUFFER_HH
+#define BUFFER_HH
+
+#include "macros.hh"
+
+#include <memory>
+#include <span>
+
+namespace modxml {
+namespace sax {
+
+class HIDDEN Buffer {
+ public:
+  virtual ~Buffer() = default;
+
+  Buffer(Buffer const&) = delete;
+  Buffer& operator=(Buffer const&) = delete;
+
+  // Returns a writable span, either at least need large or in case
+  // the buffer is full, an empty span.
+  // Returned span is valid until any other method is called on the buffer.
+  virtual std::span<uint8_t> wspan(std::size_t need = 1) = 0;
+  // Commit size data from the last returned wspan. size must be <= span.size.
+  // Remember that the span is now invalid and you need to call wspan again
+  // to write more.
+  virtual void commit(std::size_t size) = 0;
+
+  // Returns a readable span of all readily available data in buffer.
+  // If there is enough data in the buffer to satisfy want, the returned
+  // span is at least as large.
+  // Returned span is valid until any other method is called on the buffer.
+  virtual std::span<uint8_t const> rspan(std::size_t want = 1) = 0;
+  // Consume size data from buffer. size must be <= span.size.
+  // Remember that the span is now invalid and you need to call rspan again
+  // to read more.
+  virtual void consume(std::size_t size) = 0;
+
+  // Returns the same span as rspan but this is writable, you can modify
+  // the content. You cannot change the size of the span.
+  // If you wish to append data, use wspan() + commit().
+  // If you wish to remove data, use uncommit().
+  // If you wish to insert you have to be clever.
+  // Returned span is valid until any other method is called on the buffer.
+  virtual std::span<uint8_t> mspan(std::size_t want = 1) = 0;
+
+  // Uncommit the last size bytes in the buffer. Returns the bytes
+  // removed. If you used wspan() + commit() to add ten (10) bytes say and then
+  // call uncommit() with a size of seven (7) the first three (3) bytes written
+  // will be left in the buffer.
+  virtual std::size_t uncommit(std::size_t size) = 0;
+
+  // Returns true if buffer is empty.
+  virtual bool empty() const = 0;
+
+  // Returns true if buffer is full. This means filled to max_size.
+  virtual bool full() const = 0;
+
+  // Clear buffer, reset back to initial state.
+  virtual void reset() = 0;
+
+  // Write as much as possible of data to buffer.
+  // Returns bytes written (may be zero).
+  std::size_t write(std::span<uint8_t const> data);
+
+  // Either write all of the data to buffer or none. Returns true if data was
+  // written or data was empty.
+  bool write_all(std::span<uint8_t const> data);
+
+  // Read as much as possible from buffer to data.
+  // Returns bytes read (may be zero).
+  std::size_t read(std::span<uint8_t> data);
+
+  // Either fill data with data from buffer or return false.
+  bool read_all(std::span<uint8_t> data);
+
+ protected:
+  Buffer() = default;
+};
+
+// Create a buffer. default_size is used as a hint but generally that
+// will be the initial size of the buffer. max_size is a hard limit.
+// max_size == 0 is valid but will return an always full and empty buffer.
+std::unique_ptr<Buffer> HIDDEN make_buffer(std::size_t default_size,
+                                           std::size_t max_size);
+
+class ReadViewBuffer : public Buffer {  // Adds consumed() and release().
+ public:
+  // Returns bytes consumed in this buffer.
+  virtual std::size_t consumed() const = 0;
+
+  // Take ownership back of the wrapped buffer from the read view.
+  // The read view is now unusable.
+  virtual std::unique_ptr<Buffer> release() = 0;
+
+ protected:
+  ReadViewBuffer() = default;
+};
+
+// Create a read view buffer. Writing will go to the wrapped buffer. Reading
+// is done on the read view buffer without moving the wrapped buffer's read
+// pointer. These views are lightweight.
+std::unique_ptr<ReadViewBuffer> HIDDEN make_read_view_buffer(
+    std::unique_ptr<Buffer> buffer);
+
+}  // namespace sax
+}  // namespace modxml
+
+#endif  // BUFFER_HH
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc
index 30b1735..35b9b46 100644
--- a/sax/src/decoder.cc
+++ b/sax/src/decoder.cc
@@ -12,273 +12,233 @@ namespace sax {
 
 namespace {
 
-class UtfDecoder : public Decoder {
+class KnownEndianDecoder : public Decoder {  // Subclasses supply read().
  public:
-  State decode(std::string_view in, std::size_t& in_offset,
-               uint32_t* out, std::size_t out_size,
-               std::size_t& out_offset) override {
-    std::size_t const out_start = out_offset;
+  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+               std::span<uint8_t> out, std::size_t& out_offset) override {
+    std::size_t tmp = in_offset;  // Copied to in_offset only after a write.
+    uint32_t ret = read(in, tmp);
+    if (ret == utf::NEED_MORE)
+      return State::NEED_MORE;
+    if (ret == utf::INVALID)
+      return State::INVALID;
+
     if (bom_ == -1) UNLIKELY {
-      std::size_t tmp = in_offset;
-      uint32_t ret = read(in, tmp);
-      if (ret == utf::NEED_MORE) {
-        return State::NEED_MORE;
-      }
-      if (ret == utf::INVALID) {
-        return State::INVALID;
-      }
       if (ret == 0xfeff) {
         // To allow offset to advance and to return, we need to
         // read at least one more character completely.
         ret = read(in, tmp);
-        if (ret == utf::NEED_MORE) {
+        if (ret == utf::NEED_MORE)
           return State::NEED_MORE;
-        }
-        if (ret == utf::INVALID) {
+        if (ret == utf::INVALID)
           return State::INVALID;
-        }
         bom_ = 1;
       } else {
         bom_ = 0;
       }
-      in_offset = tmp;
-      out[out_offset++] = ret;
-      if (out_offset == out_size)
-        return State::GOOD;
+      if (!utf::write8(ret, out, out_offset)) {
+        bom_ = -1;  // in_offset is unchanged, so redo the BOM check next call.
+        return State::NEED_MORE;
+      }
+    } else {
+      if (!utf::write8(ret, out, out_offset))
+        return State::NEED_MORE;
     }
+    in_offset = tmp;
 
-    do {
-      uint32_t ret = read(in, in_offset);
-      if (ret == utf::NEED_MORE) {
-        return out_offset > out_start ? State::GOOD : State::NEED_MORE;
-      }
-      if (ret == utf::INVALID) {
-        return out_offset > out_start ? State::GOOD : State::INVALID;
-      }
-      out[out_offset++] = ret;
-    } while (out_offset < out_size);
-    return State::GOOD;
+    while (true) {  // One character is done; any later stop reports GOOD.
+      ret = read(in, tmp);
+      if (ret == utf::NEED_MORE || ret == utf::INVALID)
+        return State::GOOD;
+      if (!utf::write8(ret, out, out_offset))
+        return State::GOOD;
+      in_offset = tmp;
+    }
   }
 
  protected:
-  UtfDecoder() = default;
+  KnownEndianDecoder() = default;
 
-  virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;
+  virtual uint32_t read(
+      std::span<uint8_t const> data, std::size_t& offset) const = 0;
 
  private:
   int8_t bom_{-1};
 };
 
-class Utf8Decoder : public UtfDecoder {
+class Utf8Decoder : public KnownEndianDecoder {  // read() is utf::read8.
  public:
   Utf8Decoder() = default;
 
-  uint32_t read(std::string_view data, std::size_t& offset) const override {
+  uint32_t read(
+      std::span<uint8_t const> data, std::size_t& offset) const override {
     return utf::read8(data, offset);
   }
 };
 
-class Utf16BeDecoder : public UtfDecoder {
+class Utf16BeDecoder : public KnownEndianDecoder {  // read() is utf::read16be.
  public:
   Utf16BeDecoder() = default;
 
-  uint32_t read(std::string_view data, std::size_t& offset) const override {
+  uint32_t read(
+      std::span<uint8_t const> data, std::size_t& offset) const override {
     return utf::read16be(data, offset);
   }
 };
 
-class Utf16LeDecoder : public UtfDecoder {
+class Utf16LeDecoder : public KnownEndianDecoder {  // read() is utf::read16le.
  public:
   Utf16LeDecoder() = default;
 
-  uint32_t read(std::string_view data, std::size_t& offset) const override {
+  uint32_t read(
+      std::span<uint8_t const> data, std::size_t& offset) const override {
     return utf::read16le(data, offset);
   }
 };
 
-class Utf32BeDecoder : public UtfDecoder {
+class Utf32BeDecoder : public KnownEndianDecoder {  // read() is utf::read32be.
  public:
   Utf32BeDecoder() = default;
 
-  uint32_t read(std::string_view data, std::size_t& offset) const override {
+  uint32_t read(
+      std::span<uint8_t const> data, std::size_t& offset) const override {
     return utf::read32be(data, offset);
   }
 };
 
-class Utf32LeDecoder : public UtfDecoder {
+class Utf32LeDecoder : public KnownEndianDecoder {  // read() is utf::read32le.
  public:
   Utf32LeDecoder() = default;
 
-  uint32_t read(std::string_view data, std::size_t& offset) const override {
+  uint32_t read(
+      std::span<uint8_t const> data, std::size_t& offset) const override {
     return utf::read32le(data, offset);
   }
 };
 
-class Utf16Decoder : public Decoder {
+class UnknownEndianDecoder : public Decoder {  // Byte order is set by a BOM.
  public:
-  Utf16Decoder() = default;
-
-  State decode(std::string_view in, std::size_t& in_offset,
-               uint32_t* out, std::size_t out_size,
-               std::size_t& out_offset) override {
-    std::size_t const out_start = out_offset;
+  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+               std::span<uint8_t> out, std::size_t& out_offset) override {
+    std::size_t tmp = in_offset;
     if (endian_ == -1) UNLIKELY {
-      std::size_t tmp = in_offset;
-      uint32_t ret = utf::read16be(in, tmp);
-      int8_t endian;
-      if (ret == utf::NEED_MORE) {
+      uint32_t ret = readbe(in, tmp);  // Probe for the BOM as big endian.
+      if (ret == utf::NEED_MORE)
         return State::NEED_MORE;
-      }
-      if (ret == utf::INVALID) {
+      if (ret == utf::INVALID)  // NOTE(review): the removed Utf32Decoder retried with read32le here; confirm a little-endian UTF-32 BOM is still accepted.
         return State::INVALID;
-      }
       if (ret == 0xfeff) {
-        endian = 1;  // Big endian
+        endian_ = 1;
       } else if (ret == 0xfffe) {
-        endian = 0;  // Little endian
+        endian_ = 0;
       } else {
         return State::INVALID;
       }
+      in_offset = tmp;  // The BOM is consumed as soon as it is recognised.
+    }
 
-      // To allow offset to advance and to return, we need to
-      // read at least one more character completely.
-      ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
-      if (ret == utf::NEED_MORE) {
+    if (endian_ == 0) {
+      uint32_t ret = readle(in, tmp);
+      if (ret == utf::NEED_MORE)
         return State::NEED_MORE;
-      }
-      if (ret == utf::INVALID) {
+      if (ret == utf::INVALID)
         return State::INVALID;
-      }
+      if (!utf::write8(ret, out, out_offset))
+        return State::NEED_MORE;
+      in_offset = tmp;
 
-      endian_ = endian;
+      while (true) {  // One character is done; any later stop reports GOOD.
+        ret = readle(in, tmp);
+        if (ret == utf::NEED_MORE || ret == utf::INVALID)
+          return State::GOOD;
+        if (!utf::write8(ret, out, out_offset))
+          return State::GOOD;
+        in_offset = tmp;
+      }
+    } else /* if (endian_ == 1) */ {
+      uint32_t ret = readbe(in, tmp);
+      if (ret == utf::NEED_MORE)
+        return State::NEED_MORE;
+      if (ret == utf::INVALID)
+        return State::INVALID;
+      if (!utf::write8(ret, out, out_offset))
+        return State::NEED_MORE;
       in_offset = tmp;
-      out[out_offset++] = ret;
-      if (out_offset == out_size)
-        return State::GOOD;
-    }
 
-    if (endian_ == 1) {
-      do {
-        uint32_t ret = utf::read16be(in, in_offset);
-        if (ret == utf::NEED_MORE) {
-          return out_offset > out_start ? State::GOOD : State::NEED_MORE;
-        }
-        if (ret == utf::INVALID) {
-          return out_offset > out_start ? State::GOOD : State::INVALID;
-        }
-        out[out_offset++] = ret;
-      } while (out_offset < out_size);
-    } else {
-      do {
-        uint32_t ret = utf::read16le(in, in_offset);
-        if (ret == utf::NEED_MORE) {
-          return out_offset > out_start ? State::GOOD : State::NEED_MORE;
-        }
-        if (ret == utf::INVALID) {
-          return out_offset > out_start ? State::GOOD : State::INVALID;
-        }
-        out[out_offset++] = ret;
-      } while (out_offset < out_size);
+      while (true) {  // One character is done; any later stop reports GOOD.
+        ret = readbe(in, tmp);
+        if (ret == utf::NEED_MORE || ret == utf::INVALID)
+          return State::GOOD;
+        if (!utf::write8(ret, out, out_offset))
+          return State::GOOD;
+        in_offset = tmp;
+      }
     }
-    return State::GOOD;
   }
 
+ protected:
+  UnknownEndianDecoder() = default;
+
+  virtual uint32_t readle(
+      std::span<uint8_t const> data, std::size_t& offset) const = 0;
+  virtual uint32_t readbe(
+      std::span<uint8_t const> data, std::size_t& offset) const = 0;
+
  private:
   int8_t endian_{-1};
 };
 
-class Utf32Decoder : public Decoder {
+class Utf16Decoder : public UnknownEndianDecoder {  // utf::read16le/read16be.
  public:
-  Utf32Decoder() = default;
+  Utf16Decoder() = default;
 
-  State decode(std::string_view in, std::size_t& in_offset,
-               uint32_t* out, std::size_t out_size,
-               std::size_t& out_offset) override {
-    std::size_t const out_start = out_offset;
-    if (endian_ == -1) UNLIKELY {
-      std::size_t tmp = in_offset;
-      uint32_t ret = utf::read32be(in, tmp);
-      int8_t endian;
-      if (ret == utf::NEED_MORE) {
-        return State::NEED_MORE;
-      }
-      if (ret == utf::INVALID) {
-        tmp = in_offset;
-        ret = utf::read32le(in, tmp);
-        if (ret == 0xfeff) {
-          endian = 0;  // Little endian
-        } else {
-          return State::INVALID;
-        }
-      } else if (ret == 0xfeff) {
-        endian = 1;  // Big endian
-      } else {
-        return State::INVALID;
-      }
+  uint32_t readle(
+      std::span<uint8_t const> data, std::size_t& offset) const override {
+    return utf::read16le(data, offset);
+  }
 
-      // To allow offset to advance and to return, we need to
-      // read the next character completely.
-      ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
-      if (ret == utf::NEED_MORE) {
-        return State::NEED_MORE;
-      }
-      if (ret == utf::INVALID) {
-        return State::INVALID;
-      }
+  uint32_t readbe(
+      std::span<uint8_t const> data, std::size_t& offset) const override {
+    return utf::read16be(data, offset);
+  }
+};
 
-      endian_ = endian;
-      in_offset = tmp;
-      out[out_offset++] = ret;
-      if (out_offset == out_size)
-        return State::GOOD;
-    }
+class Utf32Decoder : public UnknownEndianDecoder {  // utf::read32le/read32be.
+ public:
+  Utf32Decoder() = default;
 
-    if (endian_ == 1) {
-      do {
-        uint32_t ret = utf::read32be(in, in_offset);
-        if (ret == utf::NEED_MORE) {
-          return out_offset > out_start ? State::GOOD : State::NEED_MORE;
-        }
-        if (ret == utf::INVALID) {
-          return out_offset > out_start ? State::GOOD : State::INVALID;
-        }
-        out[out_offset++] = ret;
-      } while (out_offset < out_size);
-    } else {
-      do {
-        uint32_t ret = utf::read32le(in, in_offset);
-        if (ret == utf::NEED_MORE) {
-          return out_offset > out_start ? State::GOOD : State::NEED_MORE;
-        }
-        if (ret == utf::INVALID) {
-          return out_offset > out_start ? State::GOOD : State::INVALID;
-        }
-        out[out_offset++] = ret;
-      } while (out_offset < out_size);
-    }
-    return State::GOOD;
+  uint32_t readle(
+      std::span<uint8_t const> data, std::size_t& offset) const override {
+    return utf::read32le(data, offset);
   }
 
- private:
-  int8_t endian_{-1};
+  uint32_t readbe(
+      std::span<uint8_t const> data, std::size_t& offset) const override {
+    return utf::read32be(data, offset);
+  }
 };
 
 class AsciiDecoder : public Decoder {
  public:
   AsciiDecoder() = default;
 
-  State decode(std::string_view in, std::size_t& in_offset,
-               uint32_t* out, std::size_t out_size,
-               std::size_t& out_offset) override {
-    std::size_t const out_start = out_offset;
-    do {
-      if (in_offset == in.size())
-        return out_offset > out_start ? State::GOOD : State::NEED_MORE;
-      if (in[in_offset] & 0x80)
-        return out_offset > out_start ? State::GOOD : State::INVALID;
-      out[out_offset++] = in[in_offset++];
-    } while (out_offset < out_size);
-    return State::GOOD;
+  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+               std::span<uint8_t> out, std::size_t& out_offset) override {
+    if (in_offset >= in.size())
+      return State::NEED_MORE;
+    if (in[in_offset] & 0x80)  // Bytes with the high bit set are rejected.
+      return State::INVALID;
+    if (!utf::write8(in[in_offset], out, out_offset))
+      return State::NEED_MORE;
+    ++in_offset;
+
+    while (true) {  // One byte is done; any later stop reports GOOD.
+      if (in_offset >= in.size() || in[in_offset] & 0x80)
+        return State::GOOD;
+      if (!utf::write8(in[in_offset], out, out_offset))
+        return State::GOOD;
+      ++in_offset;
+    }
   }
 };
 
diff --git a/sax/src/guessing_decoder.cc b/sax/src/guessing_decoder.cc
new file mode 100644
index 0000000..e72dab3
--- /dev/null
+++ b/sax/src/guessing_decoder.cc
@@ -0,0 +1,92 @@
+#include "guessing_decoder.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+#include <cassert>
+
+using namespace std::string_view_literals;
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+bool eq(std::span<uint8_t const> a, std::size_t& a_offset, std::string_view b) {
+  if (a.size() - a_offset < b.size())
+    return false;  // Too little input left to contain b.
+  for (size_t i = 0; i < b.size(); ++i)
+    if (a[a_offset + i] != static_cast<uint8_t>(b[i]))  // char may be signed.
+      return false;
+  return true;  // The bytes of a starting at a_offset begin with b.
+}
+
+class GuessingDecoder : public Decoder {  // Picks a decoder from first bytes.
+ public:
+  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+               std::span<uint8_t> out, std::size_t& out_offset) override {
+    assert(in_offset <= in.size());
+
+    if (!decided_) {
+      if (eq(in, in_offset, "\xef\xbb\xbf"sv)) {
+        decided_ = create_utf8_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe\x00\x00"sv)) {
+        in_offset += 4;
+        decided_ = create_utf32le_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe"sv)) {
+        // Could be UTF-32 LE BOM, need more data to decide
+        // (note, an xml document encoded in UTF-16 that is less than 4 bytes
+        //  is rather impossible).
+        if (in.size() - in_offset < 4)
+          return State::NEED_MORE;
+        in_offset += 2;
+        decided_ = create_utf16le_decoder();
+      } else if (eq(in, in_offset, "\xfe\xff"sv)) {
+        in_offset += 2;
+        decided_ = create_utf16be_decoder();
+      } else if (eq(in, in_offset, "\x00\x00\xfe\xff"sv)) {
+        in_offset += 4;
+        decided_ = create_utf32be_decoder();
+      } else {
+        auto avail = in.size() - in_offset;
+        if (avail == 0)
+          return State::NEED_MORE;
+        if (avail >= 4 && in[in_offset] == 0 && in[in_offset + 1] == 0
+            && in[in_offset + 2] == 0 && in[in_offset + 3] != 0) {
+          decided_ = create_utf32be_decoder();  // 00 00 00 xx
+        } else if (avail >= 4 && in[in_offset] != 0 && in[in_offset + 1] == 0
+                   && in[in_offset + 2] == 0 && in[in_offset + 3] == 0) {
+          decided_ = create_utf32le_decoder();  // xx 00 00 00
+        } else if (avail >= 2 && in[in_offset] == 0 && in[in_offset + 1] != 0) {
+          decided_ = create_utf16be_decoder();  // 00 xx
+        } else if (avail >= 2 && in[in_offset] != 0 && in[in_offset + 1] == 0) {
+          decided_ = create_utf16le_decoder();  // xx 00
+        } else {
+          auto tmp = in_offset;
+          auto ret = utf::read8(in, tmp);
+          if (ret == utf::NEED_MORE)
+            return State::NEED_MORE;
+          if (ret == utf::INVALID)
+            return State::INVALID;
+          // UTF-8 should be good enough to read the XML declaration.
+          decided_ = create_utf8_decoder();
+        }
+      }
+    }
+    return decided_->decode(in, in_offset, out, out_offset);
+  }
+
+ private:
+  std::unique_ptr<Decoder> decided_;
+};
+
+}  // namespace
+
+std::unique_ptr<Decoder> create_guessing_decoder() {  // Starts undecided.
+  return std::make_unique<GuessingDecoder>();
+}
+
+}  // namespace sax
+}  // namespace modxml
diff --git a/sax/src/guessing_decoder.hh b/sax/src/guessing_decoder.hh
new file mode 100644
index 0000000..0f42c3b
--- /dev/null
+++ b/sax/src/guessing_decoder.hh
@@ -0,0 +1,21 @@
+#ifndef GUESSING_DECODER_HH
+#define GUESSING_DECODER_HH
+
+#include "macros.hh"
+
+#include <memory>
+
+namespace modxml {
+namespace sax {
+
+class Decoder;
+
+// Decoder that tries to figure out which encoding is used, from a BOM or
+// from the byte pattern of the first character; optimized for the first
+// character being '<'.
+std::unique_ptr<Decoder> HIDDEN create_guessing_decoder();
+
+}  // namespace sax
+}  // namespace modxml
+
+#endif  // GUESSING_DECODER_HH
diff --git a/sax/src/sax_attributes.cc b/sax/src/sax_attributes.cc
new file mode 100644
index 0000000..230c677
--- /dev/null
+++ b/sax/src/sax_attributes.cc
@@ -0,0 +1,38 @@
+#include "sax_attributes.hh"
+
+namespace modxml {
+namespace sax {
+
+Attribute::Attribute(std::string_view name, std::string_view value)
+    : name(name), value(value) {}  // Members are set from the arguments.
+
+std::optional<std::string_view> Attributes::find_first(
+    std::string_view name) const {
+  auto const last = end();
+  for (auto cur = begin(); cur != last; ++cur)
+    if (cur->name == name)
+      return cur->value;  // First match wins.
+  return std::nullopt;
+}
+
+std::optional<std::string_view> Attributes::find_last(
+    std::string_view name) const {
+  for (auto left = size(); left-- > 0;) {  // Walk backwards from the end.
+    auto const& candidate = at(left);
+    if (candidate.name == name)
+      return candidate.value;
+  }
+  return std::nullopt;
+}
+
+std::optional<std::size_t> Attributes::find(std::string_view name,
+                                            std::size_t index) const {
+  while (index < size() && !(at(index).name == name))
+    ++index;  // Skip attributes with a different name.
+  if (index < size())
+    return index;
+  return std::nullopt;
+}
+
+}  // namespace sax
+}  // namespace modxml
diff --git a/sax/src/sax_delegate.cc b/sax/src/sax_delegate.cc
new file mode 100644
index 0000000..2c2cfcd
--- /dev/null
+++ b/sax/src/sax_delegate.cc
@@ -0,0 +1,21 @@
+#include "sax_delegate.hh"
+
+namespace modxml {
+namespace sax {
+
+// Default implementation: ignores the element name and attributes and does
+// nothing.
+void Delegate::start_element(std::string_view, Attributes const&) {}
+
+// Default implementation: ignores the element name and attributes and does
+// nothing.
+void Delegate::empty_element(std::string_view, Attributes const&) {}
+
+// Default implementation: ignores the element name and does nothing.
+void Delegate::end_element(std::string_view) {}
+
+// Default implementation: ignores its argument and does nothing.
+void Delegate::character_data(std::string_view) {}
+
+// Default implementation: ignores both arguments and does nothing.
+void Delegate::processing_instruction(std::string_view, std::string_view) {}
+
+// Default implementation: ignores the comment text and does nothing.
+void Delegate::comment(std::string_view) {}
+
+// Default implementation: ignores the error message and does nothing.
+void Delegate::error(std::string_view) {}
+
+}  // namespace sax
+}  // namespace modxml
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
index ea9f753..afc9d3b 100644
--- a/sax/src/sax_processor.cc
+++ b/sax/src/sax_processor.cc
@@ -1,18 +1,41 @@
 #include "sax_processor.hh"
 
-#include "sax_decoder.hh"
+#include <iostream>
+
+#include "buffer.hh"
+#include "guessing_decoder.hh"
 #include "processor.hh"
+#include "sax_attributes.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+#include "sax_delegate.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
 #include "utils.hh"
 
 #include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <format>
+#include <map>
 #include <optional>
 #include <utility>
+#include <vector>
+
+using namespace std::string_view_literals;
 
 namespace modxml {
 namespace sax {
 
 namespace {
 
+constexpr std::size_t kDefaultBufferSize = 8192;
+constexpr std::size_t kMinBufferSize = 128;
+
+// True when |c| is one of the ASCII decimal digits '0'..'9'.
+inline bool is_digit(char c) {
+  return !(c < '0' || c > '9');
+}
+
 // 2.2 Characters
 // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 
@@ -75,12 +98,185 @@ inline bool is_namechar(uint32_t c) {
       (c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040);
 }
 
-/* [5]   	Name	   ::=   	NameStartChar (NameChar)*
+/*
+[5]   	Name	   ::=   	NameStartChar (NameChar)*
 [6]   	Names	   ::=   	Name (#x20 Name)*
 [7]   	Nmtoken	   ::=   	(NameChar)+
 [8]   	Nmtokens	   ::=   	Nmtoken (#x20 Nmtoken)*
 */
 
+// Maps the ASCII upper-case letters 'A'..'Z' to 'a'..'z' and returns every
+// other character unchanged. The result has to be a char: returning bool
+// would collapse the mapped character to 0/1 and make every comparison
+// against real text fail.
+inline char ascii_lowercase(char c) {
+  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
+}
+
+// Compares |a| with |b| after passing every character of |a| (and only of
+// |a|) through ascii_lowercase(); |b| is compared as given. Strings of
+// different length never compare equal.
+bool eq_lowercase(std::string_view a, std::string_view b) {
+  return a.size() == b.size() &&
+         std::equal(a.begin(), a.end(), b.begin(),
+                    [](char lhs, char rhs) {
+                      return ascii_lowercase(lhs) == rhs;
+                    });
+}
+
+// Reinterprets a span of bytes as a string_view covering the same memory.
+// No data is copied; the view is only valid while the span's storage is.
+inline std::string_view make_string_view(std::span<uint8_t const> span) {
+  auto const* chars = reinterpret_cast<char const*>(span.data());
+  return {chars, span.size()};
+}
+
+// Resolves the names that may appear between '&' and ';': the five
+// predefined XML entities and numeric character references.
+class Entities {
+ public:
+  // Registers the predefined entities lt, gt, amp, apos and quot.
+  Entities() {
+    data_.emplace("lt", "<");
+    data_.emplace("gt", ">");
+    data_.emplace("amp", "&");
+    data_.emplace("apos", "'");
+    data_.emplace("quot", "\"");
+  }
+
+  // Returns the replacement text for |entity| (the text between '&' and
+  // ';', both excluded), or std::nullopt when it cannot be resolved.
+  //
+  // "#<decimal>" and "#x<hex>" are character references; their replacement
+  // is the code point encoded by utf::write8(). A reference that is not a
+  // complete number, overflows, or names a code point outside the Char
+  // production is rejected instead of being encoded.
+  std::optional<std::string> get(std::string const& entity) const {
+    if (entity.empty())
+      return std::nullopt;
+    if (entity.front() == '#') {
+      if (entity.size() == 1)
+        return std::nullopt;
+      int base = 10;
+      char const* start = entity.data() + 1;
+      char const* const end = entity.data() + entity.size();
+      if (*start == 'x') {
+        ++start;
+        base = 16;
+      }
+      uint32_t value = 0;
+      auto const [ptr, ec] = std::from_chars(start, end, value, base);
+      // No entity name starts with '#', so a malformed reference can be
+      // rejected right here.
+      if (ec != std::errc() || ptr != end || !is_char(value))
+        return std::nullopt;
+      uint8_t tmp[4];
+      std::size_t offset = 0;
+      utf::write8(value, tmp, offset);
+      return std::string(reinterpret_cast<char*>(tmp), offset);
+    }
+    auto it = data_.find(entity);
+    if (it == data_.end())
+      return std::nullopt;
+    return it->second;
+  }
+
+ private:
+  // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
+  //              | [#x10000-#x10FFFF]
+  static bool is_char(uint32_t c) {
+    return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff) ||
+           (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff);
+  }
+
+  std::map<std::string, std::string> data_;
+};
+
+// Replaces, in place, every "&name;" reference in |str| at or after index
+// |last| with the text returned by |entities|.
+//
+// Returns true when all references were resolved. Returns false on the
+// first '&' that has no terminating ';' or whose name is unknown; in that
+// case references before it have already been replaced, so |str| should be
+// discarded by the caller.
+bool deamp(Entities const& entities, std::string& str, std::size_t last = 0) {
+  while (true) {
+    auto const amp = str.find('&', last);
+    if (amp == std::string::npos)
+      break;
+    auto const name_start = amp + 1;
+    auto const semicolon = str.find(';', name_start);
+    if (semicolon == std::string::npos)
+      return false;
+    auto const replacement =
+        entities.get(str.substr(name_start, semicolon - name_start));
+    if (!replacement.has_value())
+      return false;
+    // Swap "&name;" for its replacement and continue after the inserted
+    // text, so replacement text is never scanned for references again
+    // ("&amp;lt;" becomes "&lt;", not "<") and the loop always advances.
+    str.replace(amp, semicolon - amp + 1, *replacement);
+    last = amp + replacement->size();
+  }
+  return true;
+}
+
+// Strips the first and last character (the matching quotes) from |quoted|
+// and runs deamp() over the remainder. Returns the resulting string, or
+// std::nullopt when deamp() fails.
+std::optional<std::string> unquote(Entities const& entities,
+                                   std::string_view quoted) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  auto const inner = quoted.substr(1, quoted.size() - 2);
+  std::string decoded(inner);
+  if (!deamp(entities, decoded))
+    return std::nullopt;
+  return decoded;
+}
+
+// Like unquote(), but avoids copying when the text between the quotes has
+// no '&': in that case a view into |quoted| is returned and |tmp| is left
+// untouched. Otherwise the text is copied into |tmp|, run through deamp()
+// starting at the first '&', and a view of |tmp| is returned. Returns
+// std::nullopt when deamp() fails.
+std::optional<std::string_view> unquote_if_needed(Entities const& entities,
+                                                  std::string_view quoted,
+                                                  std::string& tmp) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  auto const inner = quoted.substr(1, quoted.size() - 2);
+  auto const first_amp = inner.find('&');
+  if (first_amp != std::string_view::npos) {
+    tmp.assign(inner);
+    if (!deamp(entities, tmp, first_amp))
+      return std::nullopt;
+    return tmp;
+  }
+  return inner;
+}
+
+// Attributes implementation filled from offsets recorded while parsing a
+// tag. Names, and values without references, are views into the span given
+// to init(); values that contained references are views into decoded
+// strings owned by the individual attributes.
+class AttributesImpl : public Attributes {
+ public:
+  AttributesImpl() = default;
+
+  // Builds the attribute list from |offsets|, starting at index |first|.
+  // Every attribute uses four consecutive entries: name offset, name
+  // length, value offset, value length. The value range includes the
+  // surrounding quotes. Returns false when a value contains a reference
+  // that |entities| cannot resolve.
+  bool init(Entities const& entities,
+            std::span<const uint8_t> data,
+            std::vector<size_t> const& offsets,
+            std::size_t first) {
+    std::size_t a = first;
+    if (offsets.size() > first)
+      attr_.reserve((offsets.size() - first) / 4);
+    while (a + 4 <= offsets.size()) {
+      auto name = make_string_view(data.subspan(offsets[a], offsets[a + 1]));
+      std::string tmp;
+      auto value = unquote_if_needed(
+          entities,
+          make_string_view(data.subspan(offsets[a + 2], offsets[a + 3])),
+          tmp);
+      if (!value.has_value())
+        return false;
+      if (tmp.empty()) {
+        attr_.emplace_back(name, *value);
+      } else {
+        // |*value| views |tmp|, which dies at the end of this iteration,
+        // and moving a std::string does not keep its character storage in
+        // place for short strings. Put the decoded text on the heap first
+        // and view that copy: its address survives moves of the owner.
+        auto owned = std::make_unique<std::string>(std::move(tmp));
+        std::string_view const decoded(*owned);
+        attr_.emplace_back(name, decoded, std::move(owned));
+      }
+      a += 4;
+    }
+    return true;
+  }
+
+  iterator begin() const override {
+    return Iterator(this, 0);
+  }
+
+  iterator end() const override {
+    return Iterator(this, attr_.size());
+  }
+
+  std::size_t size() const override {
+    return attr_.size();
+  }
+
+  // No bounds checking; |index| must be less than size().
+  Attribute const& at(std::size_t index) const override {
+    return attr_[index];
+  }
+
+ private:
+  // Gives this class access to the base iterator's constructor.
+  class Iterator : public iterator {
+   public:
+    Iterator(Attributes const* attributes, std::size_t index)
+        : iterator(attributes, index) {}
+  };
+
+  struct AttributeImpl : public Attribute {
+    // |value| views storage owned by someone else (the parse buffer).
+    AttributeImpl(std::string_view name, std::string_view value)
+        : Attribute(name, value) {}
+
+    // |value| views |*owned|; taking ownership keeps that view valid for
+    // the lifetime of the attribute.
+    AttributeImpl(std::string_view name, std::string_view value,
+                  std::unique_ptr<std::string> owned)
+        : Attribute(name, value), owned_(std::move(owned)) {}
+
+   private:
+    std::unique_ptr<std::string> owned_;
+  };
+
+  std::span<const uint8_t> data_;
+  std::vector<AttributeImpl> attr_;
+};
+
 class ProcessorImpl : public Processor {
  public:
   ProcessorImpl(std::shared_ptr<Delegate> delegate,
@@ -91,15 +287,898 @@ class ProcessorImpl : public Processor {
       : delegate_(std::move(delegate)),
         decoder_factory_(std::move(decoder_factory)),
         decoder_(std::move(decoder)),
-        default_buffer_size_(default_buffer_size),
-        max_buffer_size_(max_buffer_size) {}
+        forced_decoder_(decoder_),
+        buffer_(make_buffer(default_buffer_size, max_buffer_size)) {
+    if (!decoder_)
+      decoder_ = create_guessing_decoder();
+
+    expect_document();
+  }
+
+  // Decodes |data|, starting at |offset|, into the internal buffer and runs
+  // the queued grammar commands until one of them needs more input, one of
+  // them fails, or no command is left. Returns the local |consumed|
+  // counter, which is only ever written by fill_buffer().
+  std::size_t process(std::span<uint8_t const> data,
+                      std::size_t offset) override {
+    // Each call queues exactly one FILL_BUFFER on top of the stack so the
+    // new input is decoded before any grammar command looks at the buffer.
+    cmds_.emplace_back(Command::FILL_BUFFER, Count::ZERO_OR_ONE);
+
+    std::size_t consumed = 0;
+
+    while (true) {
+      if (cmds_.empty()) {
+        // The whole document grammar has been matched; anything still
+        // buffered is reported through the delegate (and nowhere else).
+        if (!buffer_->empty())
+          delegate_->error("Extra data at end");
+        return consumed;
+      }
+
+      auto current = cmds_.back();
+      auto const old_size = cmds_.size();
+      cmds_.pop_back();
+      // Initialized so an out-of-range Command value cannot leave it unset.
+      Process ret = Process::ERROR;
+      switch (current.command) {
+        case Command::FILL_BUFFER:
+          ret = fill_buffer(data, offset, consumed);
+          break;
+        case Command::MISC:
+          ret = process_misc(current);
+          break;
+        case Command::SPACE:
+          ret = process_space(current);
+          break;
+        case Command::ELEMENT:
+          ret = process_element(current);
+          break;
+        case Command::COMMENT:
+          ret = process_comment(current);
+          break;
+        case Command::PROCESSING_INSTRUCTION:
+          ret = process_processing_instruction(current);
+          break;
+        case Command::XMLDECL:
+          ret = process_xmldecl(current);
+          break;
+        case Command::ATTRIBUTE:
+          ret = process_attribute(current);
+          break;
+        case Command::NAME:
+          ret = process_name(current);
+          break;
+        case Command::ATTRIBUTE_VALUE:
+          ret = process_attribute_value(current);
+          break;
+        case Command::EQUAL:
+          ret = process_equal(current);
+          break;
+        case Command::START_OR_EMPTY_TAG:
+          ret = process_start_or_empty_tag(current);
+          break;
+        case Command::END_TAG:
+          ret = process_end_tag(current);
+          break;
+      }
+
+      switch (ret) {
+        case Process::NEED_MORE:
+        case Process::ERROR:
+          // Put the command back (with any resume state the handler stored
+          // in |current.offset|) so the next call retries it. FILL_BUFFER
+          // is the exception: the next call queues its own, and keeping
+          // this one as well would decode that call's input twice.
+          if (current.command != Command::FILL_BUFFER) {
+            cmds_.push_back(current);
+            assert(cmds_.size() == old_size);
+          }
+          return consumed;
+        case Process::CONTINUE:
+          break;
+      }
+    }
+  }
+
+  // Current line number. Starts at 1 and is advanced by handle_ws() for
+  // every '\n' it is given.
+  uint64_t line() const override { return line_; }
+
+  // Current column. Starts at 0; handle_ws() resets it on '\n' and
+  // increments it for every other whitespace character it is given.
+  uint64_t column() const override { return column_; }
 
  private:
+  // Result of running one command handler.
+  enum class Process {
+    // The buffered data ended before the handler could decide; process()
+    // re-queues the command and returns to its caller.
+    NEED_MORE,
+    // The handler failed; process() treats this like NEED_MORE (re-queue
+    // and return).
+    ERROR,
+    // The handler is done; process() goes on with the next command.
+    CONTINUE,
+  };
+
+  // Result of comparing buffered data against an expected string, see
+  // match() and find().
+  enum class Match {
+    // The whole string was found.
+    FULL_MATCH,
+    // The available data agrees with the start of the string but ends
+    // before the string does.
+    PARTIAL_MATCH,
+    // The data does not match.
+    NO_MATCH,
+  };
+
+  // Kinds of work items on the command stack. process() dispatches each
+  // value to the handler of the corresponding name (fill_buffer() or one of
+  // the process_*() methods).
+  enum class Command {
+    // Decode new input into the buffer; queued once per process() call.
+    FILL_BUFFER,
+
+    // The remaining values each stand for one grammar construct.
+    ATTRIBUTE,
+    ATTRIBUTE_VALUE,
+    COMMENT,
+    ELEMENT,
+    END_TAG,
+    EQUAL,
+    MISC,
+    NAME,
+    PROCESSING_INSTRUCTION,
+    SPACE,
+    START_OR_EMPTY_TAG,
+    XMLDECL,
+  };
+
+  // How many occurrences of a construct are expected. no_match() reports an
+  // error only for the variants that require at least one occurrence;
+  // add_if_more() re-queues the command only for the *_MANY variants.
+  enum class Count {
+    ONE,
+    ONE_OR_MANY,
+    ZERO_OR_ONE,
+    ZERO_OR_MANY,
+  };
+
+  // One entry of the command stack.
+  struct CommandItem {
+    Command const command;
+    Count const count;
+    // Handler-specific resume state: handlers update it before returning
+    // NEED_MORE so that the re-queued command can pick up where it
+    // stopped. 0 means the handler has not started yet.
+    std::size_t offset;
+
+    CommandItem(Command command, Count count, std::size_t offset = 0)
+        : command(command), count(count), offset(offset) {}
+  };
+
+  // Per-tag scratch data, pushed by add_to_stack() and taken back by
+  // pop_stack().
+  struct StackItem {
+    // Pairs of (position, length) appended by process_name() and
+    // process_attribute_value(), in the order the tokens were parsed.
+    std::vector<std::size_t> offsets;
+  };
+
+  // Decodes input from |data|, starting at |offset|, into the write end of
+  // the buffer and commits what the decoder reported through |consumed|.
+  //
+  // Returns NEED_MORE when there is no input at |offset| or the decoder
+  // asks for more, ERROR when the decoder rejects the input or no buffer
+  // space can be obtained, CONTINUE otherwise.
+  //
+  // NOTE(review): |consumed| is both the decoder's last out-parameter and
+  // the argument to Buffer::commit(), and it is what process() returns;
+  // confirm against Decoder::decode() that these are the same quantity.
+  Process fill_buffer(std::span<uint8_t const> data,
+                      std::size_t offset,
+                      std::size_t& consumed) {
+    if (offset >= data.size())
+      return Process::NEED_MORE;
+
+    std::size_t tmp = offset;
+    auto wspan = buffer_->wspan(4);
+    // wspan() hands back an empty span when the request can never fit
+    // below the maximum size or the allocation failed; decoding into it
+    // could not make progress.
+    if (wspan.empty()) {
+      delegate_->error("Out of buffer space");
+      return Process::ERROR;
+    }
+    switch (decoder_->decode(data, tmp, wspan, consumed)) {
+      case Decoder::State::GOOD:
+        break;
+      case Decoder::State::NEED_MORE:
+        return Process::NEED_MORE;
+      case Decoder::State::INVALID:
+        delegate_->error("Invalid data");
+        return Process::ERROR;
+    }
+    buffer_->commit(consumed);
+    return Process::CONTINUE;
+  }
+
+  // Queues the commands for a whole document. Commands are taken from the
+  // back of |cmds_|, so they are pushed in reverse grammar order: the
+  // prolog is pushed last and therefore runs first.
+  void expect_document() {
+    // document := prolog element Misc*
+    expect_misc(Count::ZERO_OR_MANY);
+    expect_element(Count::ONE);
+    expect_prolog();
+  }
+
+  // Queues a MISC command with the given repetition |count|.
+  void expect_misc(Count count) {
+    cmds_.emplace_back(Command::MISC, count);
+  }
+
+  // Queues the start of an element. Only the start (or empty) tag is queued
+  // here; process_start_or_empty_tag() queues what has to follow it.
+  void expect_element(Count count) {
+    // element ::= EmptyElemTag | STag content ETag
+    cmds_.emplace_back(Command::START_OR_EMPTY_TAG, count);
+  }
+
+  // Queues an END_TAG command with the given repetition |count|.
+  void expect_end_tag(Count count) {
+    cmds_.emplace_back(Command::END_TAG, count);
+  }
+
+  // Queues the prolog. Pushed in reverse grammar order, so the optional
+  // XMLDecl runs first. expect_doctypedecl() currently queues nothing.
+  void expect_prolog() {
+    // prolog := XMLDecl? Misc* (doctypedecl Misc*)?
+    expect_misc(Count::ZERO_OR_MANY);
+    expect_doctypedecl(Count::ZERO_OR_ONE);
+    expect_misc(Count::ZERO_OR_MANY);
+    expect_xmldecl(Count::ZERO_OR_ONE);
+  }
+
+  // Queues an XMLDECL command with the given repetition |count|.
+  void expect_xmldecl(Count count) {
+    cmds_.emplace_back(Command::XMLDECL, count);
+  }
+
+  // Placeholder: doctype declarations are not implemented yet, so nothing
+  // is queued and the argument is ignored.
+  void expect_doctypedecl(Count) {
+    // TODO
+  }
+
+  // Queues a COMMENT command. |start_offset| becomes the command's initial
+  // offset; process_comment() only checks for the "<!--" opener itself when
+  // that offset is 0.
+  void expect_comment(Count count, std::size_t start_offset = 0) {
+    // Comment should never be more than one, should be MISC that is repeated.
+    assert(count == Count::ONE);
+    cmds_.emplace_back(Command::COMMENT, count, start_offset);
+  }
+
+  // Placeholder: element content is not implemented yet, so nothing is
+  // queued and the argument is ignored.
+  void expect_content(Count) {
+    // TODO
+  }
+
+  // Queues a PROCESSING_INSTRUCTION command. |start_offset| becomes the
+  // command's initial offset; process_processing_instruction() only checks
+  // for the "<?" opener itself when that offset is 0.
+  void expect_pi(Count count, std::size_t start_offset = 0) {
+    // PI should never be more than one, should be MISC that is repeated.
+    assert(count == Count::ONE);
+    cmds_.emplace_back(Command::PROCESSING_INSTRUCTION, count, start_offset);
+  }
+
+  // Queues a SPACE command (S ::= one or more whitespace characters).
+  void expect_space(Count count) {
+    // There is no way to have "S S": a single S already covers a whole run
+    // of whitespace, so we should never ask for more than zero or one.
+    assert(count == Count::ZERO_OR_ONE || count == Count::ONE);
+    cmds_.emplace_back(Command::SPACE, count);
+  }
+
+  // Queues the parsing of "S Attribute" according to |count|.
+  //
+  // For the variants that require at least one attribute, the pieces of the
+  // first attribute are queued directly (in reverse order, so the leading S
+  // runs first); ONE_OR_MANY additionally queues an optional, repeating
+  // ATTRIBUTE command that runs after that first attribute. The optional
+  // variants just queue an ATTRIBUTE command; process_attribute() decides
+  // at run time whether an attribute follows.
+  void expect_attribute(Count count) {
+    switch (count) {
+      case Count::ONE_OR_MANY:
+        cmds_.emplace_back(Command::ATTRIBUTE, Count::ZERO_OR_MANY);
+        [[fallthrough]];  // then queue the mandatory first attribute
+      case Count::ONE:
+        // Attribute ::= Name Eq AttValue
+        expect_attribute_value(Count::ONE);
+        expect_equal(Count::ONE);
+        expect_name(Count::ONE);
+        expect_space(Count::ONE);
+        break;
+      case Count::ZERO_OR_ONE:
+      case Count::ZERO_OR_MANY:
+        cmds_.emplace_back(Command::ATTRIBUTE, count);
+        break;
+    }
+  }
+
+  // Queues an ATTRIBUTE_VALUE command with the given repetition |count|.
+  void expect_attribute_value(Count count) {
+    cmds_.emplace_back(Command::ATTRIBUTE_VALUE, count);
+  }
+
+  // Queues an equal sign surrounded by optional whitespace. |count| only
+  // applies to the '=' itself; both SPACE commands are always optional.
+  void expect_equal(Count count) {
+    // Eq ::= S? '=' S?
+    expect_space(Count::ZERO_OR_ONE);
+    cmds_.emplace_back(Command::EQUAL, count);
+    expect_space(Count::ZERO_OR_ONE);
+  }
+
+  // Queues a NAME command with the given repetition |count|.
+  void expect_name(Count count) {
+    cmds_.emplace_back(Command::NAME, count);
+  }
+
+  // Looks at the start of the buffer, without consuming anything, and
+  // queues the command for whichever Misc alternative begins there. When
+  // none does, the result depends on |item.count| (see no_match()).
+  Process process_misc(CommandItem const& item) {
+    // Misc := Comment | PI | S
+    assert(item.offset == 0);
+
+    switch (match("<!--")) {
+      case Match::FULL_MATCH:
+        add_if_more(item);
+        // The opener stays in the buffer; tell the comment handler how many
+        // bytes of it have been matched already ("<!--" is 4 bytes long).
+        expect_comment(Count::ONE, 4);
+        return Process::CONTINUE;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        break;
+    }
+
+    switch (match("<?")) {
+      case Match::FULL_MATCH:
+        add_if_more(item);
+        // Same convention as for comments: "<?" is 2 bytes long.
+        expect_pi(Count::ONE, 2);
+        return Process::CONTINUE;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        break;
+    }
+
+    switch (match_s()) {
+      case Match::FULL_MATCH:
+        add_if_more(item);
+        expect_space(Count::ONE);
+        return Process::CONTINUE;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        break;
+    }
+
+    return no_match(item);
+  }
+
+  // Handles one repetition of (S Attribute)*. |item.offset| counts the
+  // whitespace characters consumed so far, across resumed calls.
+  Process process_attribute(CommandItem& item) {
+    // This actually parses (S Attribute)* when followed by S?
+    // for Attribute parsing see expect_attribute()
+    // So we need to figure out if the S means start of attribute
+    // or just an S. We do this by checking if the first non-S is
+    // a namestart or something else. We consume the S.
+    uint32_t last_char = 0;
+    auto ret = consume_space(item.offset, last_char);
+    if (ret != Process::CONTINUE)
+      return ret;
+
+    // No S, cannot be followed by an attribute then.
+    if (item.offset == 0)
+      return no_match(item);
+
+    // First character after S isn't a valid first character of a name,
+    // cannot be followed by an attribute then.
+    if (!is_namestartchar(last_char))
+      return no_match(item);
+
+    // Queue the next repetition before this attribute's parts, so it runs
+    // after them. Without this only a single attribute would ever be
+    // parsed for the *_MANY counts.
+    add_if_more(item);
+    expect_attribute_value(Count::ONE);
+    expect_equal(Count::ONE);
+    expect_name(Count::ONE);
+    return Process::CONTINUE;
+  }
+
+  // Consumes a single '=' from the start of the buffer. When the buffer
+  // starts with something else, the result depends on |item.count| (see
+  // no_match()).
+  Process process_equal(CommandItem const& item) {
+    // Eq ::= S? '=' S?
+    // Spacing added by expect_equal
+    switch (match_consume("=")) {
+      case Match::FULL_MATCH:
+        add_if_more(item);
+        return Process::CONTINUE;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        return no_match(item);
+    }
+    // Every enumerator returns above; this keeps the function from running
+    // off its end should Match ever hold another value.
+    assert(false);
+    return Process::ERROR;
+  }
+
+  // Parses a Name at the start of the buffer. |item.offset| is the number
+  // of bytes of the name accepted so far, so a re-queued command resumes
+  // after them.
+  //
+  // A name is only complete once a character that is not a NameChar has
+  // been seen; if the buffered data ends first, NEED_MORE is returned. On
+  // success the pair (read view position, name length) is appended to the
+  // top StackItem and the name is consumed. Requires |buffer_| to be the
+  // read view installed by add_to_stack().
+  Process process_name(CommandItem& item) {
+    // Name ::= NameStartChar (NameChar)*
+    auto data = buffer_->rspan(item.offset + 4);
+    while (true) {
+      // Decode into a scratch offset so |item.offset| only advances over
+      // characters that were accepted as part of the name.
+      std::size_t tmp = item.offset;
+      auto c = utf::read8(data, tmp);
+      if (c == utf::NEED_MORE)
+        return Process::NEED_MORE;
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, tmp);
+      if (item.offset == 0) {
+        if (!is_namestartchar(c))
+          return no_match(item);
+      } else {
+        if (!is_namechar(c))
+          break;
+      }
+      item.offset = tmp;
+    }
+
+    assert(!stack_.empty());
+    auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+    stack_.back().offsets.push_back(read_view->consumed());
+    stack_.back().offsets.push_back(item.offset);
+    buffer_->consume(item.offset);
+    return Process::CONTINUE;
+  }
+
+  // Parses a quoted attribute value at the start of the buffer.
+  // |item.offset| is the number of bytes scanned so far, including the
+  // opening quote, so a re-queued command resumes after them.
+  //
+  // On success the pair (read view position, length) is appended to the top
+  // StackItem and the value is consumed; the recorded range includes both
+  // quotes. References inside the value are not looked at here. Requires
+  // |buffer_| to be the read view installed by add_to_stack().
+  Process process_attribute_value(CommandItem& item) {
+    // AttValue ::= '"' ([^<&"] | Reference)* '"'
+    //              | "'" ([^<&'] | Reference)* "'"
+
+    uint32_t end_char;
+    auto data = buffer_->rspan(item.offset + 4);
+
+    if (item.offset == 0) {
+      // First visit: the value has to start with one of the two quotes.
+      std::size_t tmp = item.offset;
+      auto c = utf::read8(data, tmp);
+      if (c == utf::NEED_MORE)
+        return Process::NEED_MORE;
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, tmp);
+      if (c != '"' && c != '\'')
+        return no_match(item);
+      item.offset = tmp;
+      end_char = c;
+    } else {
+      // Resumed: the opening quote is still the first byte of the buffer.
+      assert(!data.empty());
+      end_char = data[0];  // ok as both " and ' are ASCII
+    }
+
+    while (true) {
+      auto c = utf::read8(data, item.offset);
+      if (c == utf::NEED_MORE)
+        return Process::NEED_MORE;
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, item.offset);
+      if (c == end_char)
+        break;
+      // TODO: Should we validate reference already here or do we let
+      // unquote take care of that? As Reference can't contain end_char
+      // only checking for end_char is safe here.
+    }
+
+    assert(!stack_.empty());
+    auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+    stack_.back().offsets.push_back(read_view->consumed());
+    stack_.back().offsets.push_back(item.offset);
+    buffer_->consume(item.offset);
+    return Process::CONTINUE;
+  }
+
+  // Parses a comment that starts at the beginning of the buffer and reports
+  // the text between "<!--" and "-->" to the delegate.
+  //
+  // The opener is never consumed on its own; the whole comment is consumed
+  // in one go once the terminator has been found. |item.offset| is the
+  // index from which the terminator is searched: 0 means the opener has not
+  // been verified yet, any other value means it has (by process_misc() or
+  // by an earlier call of this function).
+  //
+  // NOTE(review): the text is not checked for "--" or invalid characters,
+  // and line()/column() are not advanced over comments.
+  Process process_comment(CommandItem& item) {
+    constexpr std::string_view kOpen = "<!--";
+    constexpr std::string_view kClose = "-->";
+
+    if (item.offset == 0) {
+      switch (match(kOpen)) {
+        case Match::FULL_MATCH:
+          break;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+    // The terminator must not overlap the opener ("<!--->" is not a
+    // complete comment), so never start searching inside it.
+    if (item.offset < kOpen.size())
+      item.offset = kOpen.size();
+
+    auto const text =
+        make_string_view(buffer_->rspan(item.offset + kClose.size()));
+    auto const end = text.find(kClose, item.offset);
+    if (end == std::string_view::npos) {
+      // Resume just before the last bytes, which may be the beginning of a
+      // terminator that is completed by the next chunk of input.
+      auto const keep = kClose.size() - 1;
+      if (text.size() > item.offset + keep)
+        item.offset = text.size() - keep;
+      return Process::NEED_MORE;
+    }
+
+    delegate_->comment(text.substr(kOpen.size(), end - kOpen.size()));
+    buffer_->consume(end + kClose.size());
+    return Process::CONTINUE;
+  }
+
+  // Placeholder for processing instructions. When |item.offset| is 0 the
+  // "<?" opener is matched and consumed first; after that the function
+  // always reports "PI not supported" and returns ERROR.
+  Process process_processing_instruction(CommandItem& item) {
+    if (item.offset == 0) {
+      switch (match_consume("<?")) {
+        case Match::FULL_MATCH:
+          item.offset += 2;
+          break;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    // TODO
+    delegate_->error("PI not supported");
+    return Process::ERROR;
+  }
+
+  // Starts a tag-like construct whose tokens are collected before anything
+  // is taken out of the real buffer:
+  //  - re-queues |item|'s command with |offset| as its new offset, so the
+  //    same handler runs again (second phase) after the sub-commands the
+  //    caller queues next,
+  //  - pushes a fresh StackItem for the offsets of those tokens,
+  //  - wraps |buffer_| in a read view buffer and consumes |offset| bytes
+  //    (the already matched opener) from that view.
+  // Undone by pop_stack().
+  void add_to_stack(CommandItem const& item, std::size_t offset) {
+    cmds_.emplace_back(item.command, item.count, offset);
+    stack_.emplace_back();
+    buffer_ = make_read_view_buffer(std::move(buffer_));
+    buffer_->consume(offset);
+  }
+
+  // Ends the construct started by add_to_stack(): swaps the collected
+  // offsets of the top StackItem into |attr|, puts the buffer released by
+  // the read view back into |buffer_| and removes the StackItem. Returns
+  // the read view's consumed() value, read just before the view is
+  // released. Requires |buffer_| to be a ReadViewBuffer.
+  std::size_t pop_stack(std::vector<std::size_t>& attr) {
+    assert(!stack_.empty());
+    std::swap(attr, stack_.back().offsets);
+
+    auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+    auto consumed = read_view->consumed();
+
+    // |read_view| points at the object owned by |buffer_|; the assignment
+    // below replaces that object, so the pointer must not be used after it.
+    buffer_ = read_view->release();
+    stack_.pop_back();
+
+    return consumed;
+  }
+
+  // Parses the XML declaration in two phases, told apart by |item.offset|.
+  //
+  // Phase 1 (offset 0): if the buffer starts with "<?xml", re-queue this
+  // command with offset 5 via add_to_stack() and queue one or more generic
+  // attributes followed by optional whitespace.
+  //
+  // Phase 2 (offset 5): expect "?>", then validate the collected
+  // attributes: "version" is mandatory and must come first; "encoding" and
+  // "standalone" are optional and must follow in that order; anything else
+  // is an error. An "encoding" value replaces |decoder_| unless a decoder
+  // was forced at construction.
+  //
+  // NOTE(review): "<?xml" is also a prefix of other PI targets (for example
+  // "<?xml-stylesheet"); such input takes the XMLDecl path here.
+  // NOTE(review): the error returns in phase 2 happen after pop_stack(),
+  // yet process() re-queues this command with offset 5; confirm that
+  // callers stop feeding data after an error.
+  Process process_xmldecl(CommandItem const& item) {
+    // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+    if (item.offset == 0) {
+      switch (match("<?xml")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 5);
+          expect_space(Count::ZERO_OR_ONE);
+          // Parsing as generic "Attribute" here and doing validation later.
+          expect_attribute(Count::ONE_OR_MANY);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    assert(item.offset == 5);
+
+    // Remember that this is still reading for the read view buffer.
+    switch (match_consume("?>")) {
+      case Match::FULL_MATCH:
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        delegate_->error(std::format("Expected end of {}",
+                                     command_name(item.command)));
+        return Process::ERROR;
+    }
+
+    // |attr| holds four entries per attribute: name position, name length,
+    // value position, value length (the value range includes the quotes).
+    std::vector<std::size_t> attr;
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+    std::size_t a = 0;
+
+    if (a + 4 <= attr.size() &&
+        make_string_view(data.subspan(attr[a + 0],
+                                      attr[a + 1])) == "version") {
+      // +1 / -2 strips the surrounding quotes from the value.
+      auto version = make_string_view(data.subspan(attr[a + 2] + 1,
+                                                   attr[a + 3] - 2));
+      if (!valid_version(version)) {
+        delegate_->error(std::format("Unsupported xmldecl version, {}",
+                                     version));
+        return Process::ERROR;
+      }
+      a += 4;
+    } else {
+      // No version
+      delegate_->error("Invalid xmldecl, must have a version attribute first.");
+      return Process::ERROR;
+    }
+
+    if (a + 4 <= attr.size() &&
+        make_string_view(data.subspan(attr[a + 0],
+                                      attr[a + 1])) == "encoding") {
+      auto encoding = make_string_view(data.subspan(attr[a + 2] + 1,
+                                                    attr[a + 3] - 2));
+      if (forced_decoder_) {
+        // encoding value is ignored
+        // TODO: Should we check that it is valid anyway?
+      } else {
+        // Built-in decoders first, then the user supplied factory (if any).
+        auto decoder = pick_decoder_for_encoding(encoding, nullptr);
+        if (!decoder && decoder_factory_)
+          decoder = decoder_factory_->create(encoding);
+        if (!decoder) {
+          delegate_->error(std::format("Unknown encoding {}", encoding));
+          return Process::ERROR;
+        }
+        std::swap(decoder_, decoder);
+        // TODO: Re-decode the rest of the buffer?
+      }
+      a += 4;
+    }
+
+    if (a + 4 <= attr.size() &&
+        make_string_view(data.subspan(attr[a + 0],
+                                      attr[a + 1])) == "standalone") {
+      auto sd = make_string_view(data.subspan(attr[a + 2] + 1,
+                                              attr[a + 3] - 2));
+      if (sd == "yes") {
+        // TODO: Handle standalone == yes
+      } else if (sd == "no") {
+        // TODO: Handle standalone == no
+      } else {
+        delegate_->error(std::format(
+            "Invalid xmldecl, standalone attribute has unsupported value, {}",
+            sd));
+        return Process::ERROR;
+      }
+      a += 4;
+    }
+
+    // Anything left over is an unknown, repeated or misplaced attribute.
+    if (a < attr.size()) {
+      delegate_->error(
+          std::format("Invalid xmldecl, unknown attribute, {}",
+                      make_string_view(data.subspan(attr[a + 0],
+                                                    attr[a + 1]))));
+      return Process::ERROR;
+    }
+
+    // Only now is the declaration removed from the real buffer.
+    buffer_->consume(consumed);
+    return Process::CONTINUE;
+  }
+
+  // Parses a start tag or an empty-element tag in two phases, told apart by
+  // |item.offset|.
+  //
+  // Phase 1 (offset 0): if the buffer starts with '<', re-queue this
+  // command with offset 1 via add_to_stack() and queue the name, any number
+  // of attributes and optional whitespace.
+  //
+  // Phase 2 (offset 1): expect "/>" or ">", build the attribute list from
+  // the collected offsets and notify the delegate. For a start tag the
+  // matching end tag is queued as well (expect_content() currently queues
+  // nothing). The tag is removed from the real buffer only after the
+  // delegate has been called, since the name and the attributes are views
+  // into that buffer.
+  Process process_start_or_empty_tag(CommandItem const& item) {
+    // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
+    // STag         ::=	'<' Name (S Attribute)* S? '>'
+    if (item.offset == 0) {
+      switch (match("<")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 1);
+          expect_space(Count::ZERO_OR_ONE);
+          expect_attribute(Count::ZERO_OR_MANY);
+          expect_name(Count::ONE);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    assert(item.offset == 1);
+
+    bool empty_tag;
+
+    // Remember that this is still reading for the read view buffer.
+    switch (match_consume("/>")) {
+      case Match::FULL_MATCH:
+        empty_tag = true;
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        switch (match_consume(">")) {
+          case Match::FULL_MATCH:
+            empty_tag = false;
+            break;
+          case Match::PARTIAL_MATCH:
+            return Process::NEED_MORE;
+          case Match::NO_MATCH:
+            delegate_->error(std::format("Expected end of {}",
+                                         command_name(item.command)));
+            return Process::ERROR;
+        }
+        break;
+    }
+
+    // |attr| starts with the element name (position, length), followed by
+    // four entries per attribute.
+    std::vector<std::size_t> attr;
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+
+    assert(attr.size() >= 2);
+    auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+    AttributesImpl attributes;
+    // init() takes the offsets by const reference, so the std::move() here
+    // does not actually transfer anything.
+    if (!attributes.init(entities_, data, std::move(attr), 2)) {
+      delegate_->error("Invalid references in attribute values");
+      return Process::ERROR;
+    }
+
+    add_if_more(item);
+
+    if (empty_tag) {
+      delegate_->empty_element(name, attributes);
+    } else {
+      delegate_->start_element(name, attributes);
+      expect_end_tag(Count::ONE);
+      expect_content(Count::ONE);
+    }
+
+    buffer_->consume(consumed);
+    return Process::CONTINUE;
+  }
+
+  // Parses an end tag in two phases, told apart by |item.offset|.
+  //
+  // Phase 1 (offset 0): if the buffer starts with "</", re-queue this
+  // command with offset 2 via add_to_stack() and queue the name followed by
+  // optional whitespace.
+  //
+  // Phase 2 (offset 2): expect '>', report the name to the delegate and
+  // remove the tag from the real buffer.
+  //
+  // NOTE(review): the name is not compared with the name of the start tag
+  // that is being closed.
+  Process process_end_tag(CommandItem const& item) {
+    // ETag ::=	'</' Name S? '>'
+    if (item.offset == 0) {
+      switch (match("</")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 2);
+          expect_space(Count::ZERO_OR_ONE);
+          expect_name(Count::ONE);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    // Must agree with the offset handed to add_to_stack() above.
+    assert(item.offset == 2);
+
+    // Remember that this is still reading for the read view buffer.
+    switch (match_consume(">")) {
+      case Match::FULL_MATCH:
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        delegate_->error(std::format("Expected end of {}",
+                                     command_name(item.command)));
+        return Process::ERROR;
+    }
+
+    // |attr| holds just the element name: position and length.
+    std::vector<std::size_t> attr;
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+
+    assert(attr.size() == 2);
+    auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+    add_if_more(item);
+
+    delegate_->end_element(name);
+
+    // |name| views the buffer, so consume only after the delegate call.
+    buffer_->consume(consumed);
+    return Process::CONTINUE;
+  }
+
+  // Accepts version numbers of the form "1." followed by one or more ASCII
+  // digits, e.g. "1.0" or "1.10"; everything else is rejected.
+  static bool valid_version(std::string_view version) {
+    constexpr std::string_view kMajor = "1.";
+    if (version.size() <= kMajor.size() || !version.starts_with(kMajor))
+      return false;
+    auto const minor = version.substr(kMajor.size());
+    return std::all_of(minor.begin(), minor.end(),
+                       [](char c) { return is_digit(c); });
+  }
+
+  // Placeholder for the ELEMENT command: always reports an error through
+  // the delegate and returns ERROR. |item| is not used.
+  Process process_element(CommandItem& item) {
+    // TODO
+    delegate_->error("Element is not yet supported");
+    return Process::ERROR;
+  }
+
+  // Consumes the run of whitespace at the start of the buffer.
+  //
+  // |count| is incremented once per whitespace character, and handle_ws()
+  // is called for each to keep line/column up to date. Returns CONTINUE
+  // once a non-whitespace character has been seen; that character is
+  // stored in |last_char| and left in the buffer. Returns NEED_MORE when
+  // the buffered data ends first; the whitespace seen so far is consumed in
+  // that case too, which is why callers keep the running total in |count|.
+  // |last_char| is only written on CONTINUE.
+  Process consume_space(std::size_t& count, uint32_t& last_char) {
+    auto data = buffer_->rspan(4);
+    std::size_t consumed = 0;
+    while (true) {
+      // |offset| runs one character ahead of |consumed|.
+      std::size_t offset = consumed;
+      auto c = utf::read8(data, offset);
+      if (c == utf::NEED_MORE) {
+        buffer_->consume(consumed);
+        return Process::NEED_MORE;
+      }
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, offset);
+      if (!is_ws(c)) {
+        last_char = c;
+        buffer_->consume(consumed);
+        return Process::CONTINUE;
+      }
+      ++count;
+      handle_ws(c);
+      consumed = offset;
+    }
+  }
+
+  // Consumes a run of whitespace. If none was found (also counting earlier,
+  // resumed calls) the result depends on |item.count|, see no_match().
+  Process process_space(CommandItem& item) {
+    // S ::= (#x20 | #x9 | #xD | #xA)+
+    // item.offset is only used to count spaces. We consume each space as it
+    // is found so no offset in buffer.
+    uint32_t unused;
+    auto ret = consume_space(item.offset, unused);
+    if (ret != Process::CONTINUE)
+      return ret;
+
+    if (item.offset == 0)
+      return no_match(item);
+
+    add_if_more(item);
+    return Process::CONTINUE;
+  }
+
+  // Called after one occurrence of |item| has been matched: if its count
+  // allows further occurrences, queue the same command again as "zero or
+  // many" with a fresh offset of 0. Does nothing for ONE and ZERO_OR_ONE.
+  void add_if_more(CommandItem const& item) {
+    bool const repeatable = item.count == Count::ONE_OR_MANY ||
+                            item.count == Count::ZERO_OR_MANY;
+    if (repeatable)
+      cmds_.emplace_back(item.command, Count::ZERO_OR_MANY);
+  }
+
+  // Searches the buffered data for |str|, starting at index |offset|.
+  //
+  //  FULL_MATCH:    |str| was found; |offset| is the index of its first
+  //                 byte.
+  //  PARTIAL_MATCH: the data ends with a proper, non-empty prefix of |str|;
+  //                 |offset| is the index where that prefix starts, so the
+  //                 search can be resumed there once more data is
+  //                 available.
+  //  NO_MATCH:      neither of the above; |offset| is moved to the end of
+  //                 the data (or left alone if it was already at or past
+  //                 the end).
+  //
+  // |str| must not be empty.
+  Match find(std::string_view str, std::size_t& offset) {
+    auto const text = make_string_view(buffer_->rspan(offset + str.size()));
+    if (offset >= text.size())
+      return Match::NO_MATCH;
+
+    auto const hit = text.find(str, offset);
+    if (hit != std::string_view::npos) {
+      offset = hit;
+      return Match::FULL_MATCH;
+    }
+
+    // Look for the longest tail of the data, not reaching back before
+    // |offset|, that is the beginning of |str|.
+    auto tail = std::min(str.size() - 1, text.size() - offset);
+    for (; tail > 0; --tail) {
+      if (text.substr(text.size() - tail) == str.substr(0, tail)) {
+        offset = text.size() - tail;
+        return Match::PARTIAL_MATCH;
+      }
+    }
+
+    offset = text.size();
+    return Match::NO_MATCH;
+  }
+
+  // Compares |str| with the buffered data at index |offset| without
+  // consuming anything. NO_MATCH as soon as an available byte differs;
+  // PARTIAL_MATCH when all available bytes agree but fewer than
+  // |str.size()| bytes (possibly none) are buffered; FULL_MATCH otherwise.
+  Match match(std::string_view str, std::size_t offset = 0) {
+    auto const bytes = buffer_->rspan(offset + str.size());
+    if (bytes.size() <= offset)
+      return Match::PARTIAL_MATCH;
+
+    auto const comparable = std::min(str.size(), bytes.size() - offset);
+    auto const same = std::equal(
+        str.begin(), str.begin() + comparable, bytes.begin() + offset,
+        [](char want, uint8_t got) { return want == got; });
+    if (!same)
+      return Match::NO_MATCH;
+    return comparable < str.size() ? Match::PARTIAL_MATCH
+                                   : Match::FULL_MATCH;
+  }
+
+  // Like match() at the start of the buffer, but additionally consumes
+  // |str| from the buffer when it matched in full.
+  Match match_consume(std::string_view str) {
+    auto const outcome = match(str);
+    if (outcome != Match::FULL_MATCH)
+      return outcome;
+    buffer_->consume(str.size());
+    return outcome;
+  }
+
+  // Checks, without consuming anything, whether the buffer starts with a
+  // whitespace character. PARTIAL_MATCH is only returned when the buffer is
+  // empty; an incomplete, invalid or non-whitespace first character is a
+  // NO_MATCH.
+  Match match_s() {
+    auto const bytes = buffer_->rspan(4);
+    std::size_t cursor = 0;
+    auto const c = utf::read8(bytes, cursor);
+    if (c == utf::NEED_MORE)
+      return bytes.empty() ? Match::PARTIAL_MATCH : Match::NO_MATCH;
+
+    bool const is_space = c != utf::INVALID && valid_char(c) && is_ws(c);
+    return is_space ? Match::FULL_MATCH : Match::NO_MATCH;
+  }
+
+  // Called when the construct for |item| is not present. That is fine for
+  // the optional counts (CONTINUE); for the counts that require at least
+  // one occurrence the delegate is told what was expected and ERROR is
+  // returned.
+  Process no_match(CommandItem const& item) {
+    bool const required =
+        item.count == Count::ONE || item.count == Count::ONE_OR_MANY;
+    if (!required)
+      return Process::CONTINUE;
+
+    delegate_->error(std::format("Expected {}",
+                                 command_name(item.command)));
+    return Process::ERROR;
+  }
+
+  // Updates the position counters for one whitespace character: a line
+  // feed starts a new line at column 0, anything else advances the column.
+  void handle_ws(uint32_t c) {
+    if (c != '\n') {
+      ++column_;
+      return;
+    }
+    ++line_;
+    column_ = 0;
+  }
+
+  // Reports an invalid character to the delegate and returns ERROR. The
+  // message includes the byte at |data[offset]| in hex when that index is
+  // inside |data|. Callers pass the offset reached after decoding, which
+  // can be one past the end of |data|; the byte is left out then instead of
+  // reading out of bounds.
+  Process invalid_char(std::span<uint8_t const> data, std::size_t offset) {
+    if (offset < data.size()) {
+      delegate_->error(std::format("Invalid char {:02x}", data[offset]));
+    } else {
+      delegate_->error("Invalid char");
+    }
+    return Process::ERROR;
+  }
+
+  static std::string_view command_name(Command command) {
+    switch (command) {
+      case Command::MISC:
+        return "misc"sv;
+      case Command::FILL_BUFFER:
+        return "more data"sv;
+      case Command::ELEMENT:
+        return "element"sv;
+      case Command::SPACE:
+        return "whitespace"sv;
+      case Command::COMMENT:
+        return "comment"sv;
+      case Command::PROCESSING_INSTRUCTION:
+        return "processing instruction"sv;
+      case Command::XMLDECL:
+        return "xml declaration"sv;
+      case Command::ATTRIBUTE:
+        return "attribute"sv;
+      case Command::ATTRIBUTE_VALUE:
+        return "attribute value"sv;
+      case Command::NAME:
+        return "name"sv;
+      case Command::EQUAL:
+        return "equal sign (=)"sv;
+      case Command::START_OR_EMPTY_TAG:
+        return "element"sv;
+      case Command::END_TAG:
+        return "end tag"sv;
+    }
+    assert(false);
+    return {};
+  }
+
   std::shared_ptr<Delegate> delegate_;
   std::shared_ptr<DecoderFactory> decoder_factory_;
   std::unique_ptr<Decoder> decoder_;
-  std::size_t default_buffer_size_;
-  std::size_t max_buffer_size_;
+  bool const forced_decoder_;
+  std::unique_ptr<Buffer> buffer_;
+  Entities entities_;
+  std::vector<CommandItem> cmds_;
+  std::vector<StackItem> stack_;
+  uint64_t line_{1};
+  uint64_t column_{0};
 };
 
 }  // namespace
@@ -117,9 +1196,9 @@ std::unique_ptr<Processor> create_processor(
                                         decoder_factory.get());
   }
 
-  std::size_t default_buffer_size = 8192;
+  std::size_t default_buffer_size = kDefaultBufferSize;
   if (opt_default_buffer_size.has_value())
-    default_buffer_size = std::max(static_cast<std::size_t>(128),
+    default_buffer_size = std::max(kMinBufferSize,
                                    opt_default_buffer_size.value());
   // This value is documented in public headers. Do NOT change.
   std::size_t max_buffer_size = 10 * 1024 * 1024;
@@ -136,7 +1215,8 @@ std::unique_ptr<Processor> create_processor(
                                          max_buffer_size);
 }
 
-std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) {
+std::unique_ptr<Processor>  // delegate only: nullptr + 3x nullopt below
+Processor::create(std::shared_ptr<Delegate> delegate) {
   return create_processor(std::move(delegate), nullptr,
                           std::nullopt, std::nullopt, std::nullopt);
 }
diff --git a/sax/src/utils.cc b/sax/src/utils.cc
index f0366d5..e3a53b1 100644
--- a/sax/src/utils.cc
+++ b/sax/src/utils.cc
@@ -9,7 +9,7 @@ namespace sax {
 
 namespace {
 
-std::string cleanup_encoding(std::string const& str) {
+std::string cleanup_encoding(std::string_view str) {
   std::string ret;
   ret.reserve(str.size());
   for (auto c : str) {
@@ -29,29 +29,29 @@ std::string cleanup_encoding(std::string const& str) {
 // Names inspired by:
 // https://www.iana.org/assignments/character-sets/character-sets.xhtml
 std::unique_ptr<Decoder> pick_decoder_for_encoding(
-    std::string const& encoding, DecoderFactory* factory) {
+    std::string_view encoding, DecoderFactory* factory) {
   auto clean_enc = cleanup_encoding(encoding);
-  if (clean_enc == "utf-8" || clean_enc == "utf8") {
+  if (clean_enc == "utf-8" || clean_enc == "utf8")
     return create_utf8_decoder();
-  }
-  if (clean_enc == "utf-16" || clean_enc == "utf16") {
+
+  if (clean_enc == "utf-16" || clean_enc == "utf16")
     return create_utf16_decoder();
-  }
-  if (clean_enc == "utf-16be" || clean_enc == "utf16be") {
+
+  if (clean_enc == "utf-16be" || clean_enc == "utf16be")
     return create_utf16be_decoder();
-  }
-  if (clean_enc == "utf-16le" || clean_enc == "utf16le") {
+
+  if (clean_enc == "utf-16le" || clean_enc == "utf16le")
     return create_utf16le_decoder();
-  }
-  if (clean_enc == "utf-32" || clean_enc == "utf32") {
+
+  if (clean_enc == "utf-32" || clean_enc == "utf32")
     return create_utf32_decoder();
-  }
-  if (clean_enc == "utf-32be" || clean_enc == "utf32be") {
+
+  if (clean_enc == "utf-32be" || clean_enc == "utf32be")
     return create_utf32be_decoder();
-  }
-  if (clean_enc == "utf-32le" || clean_enc == "utf32le") {
+
+  if (clean_enc == "utf-32le" || clean_enc == "utf32le")
     return create_utf32le_decoder();
-  }
+
   if (clean_enc == "ascii" || clean_enc == "us-ascii" ||
       clean_enc == "usascii" || clean_enc == "iso-ir-6" ||
       clean_enc == "ansi-x3-4-1968" || clean_enc == "ansi-x3-4-1986" ||
@@ -59,9 +59,10 @@ std::unique_ptr<Decoder> pick_decoder_for_encoding(
       clean_enc == "us" || clean_enc == "ibm367" || clean_enc == "cp367") {
     return create_ascii_decoder();
   }
-  if (factory) {
+
+  if (factory)
     return factory->create(encoding);
-  }
+
   return nullptr;
 }
 
diff --git a/sax/src/utils.hh b/sax/src/utils.hh
index 206d003..074f0c0 100644
--- a/sax/src/utils.hh
+++ b/sax/src/utils.hh
@@ -4,7 +4,7 @@
 #include "macros.hh"
 
 #include <memory>
-#include <string>
+#include <string_view>
 
 namespace modxml {
 namespace sax {
@@ -13,7 +13,7 @@ class Decoder;
 class DecoderFactory;
 
 std::unique_ptr<Decoder> HIDDEN pick_decoder_for_encoding(
-    std::string const& encoding,
+    std::string_view encoding,
     DecoderFactory* factory);
 
 }  // namespace sax