1 files changed, 1089 insertions, 9 deletions
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
index ea9f753..afc9d3b 100644
--- a/sax/src/sax_processor.cc
+++ b/sax/src/sax_processor.cc
@@ -1,18 +1,41 @@
 #include "sax_processor.hh"
 
-#include "sax_decoder.hh"
+#include <iostream>
+
+#include "buffer.hh"
+#include "guessing_decoder.hh"
 #include "processor.hh"
+#include "sax_attributes.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+#include "sax_delegate.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
 #include "utils.hh"
 
 #include <algorithm>
+#include <cassert>
+#include <charconv>
+#include <format>
+#include <map>
 #include <optional>
 #include <utility>
+#include <vector>
+
+using namespace std::string_view_literals;
 
 namespace modxml {
 namespace sax {
 
 namespace {
 
+constexpr std::size_t kDefaultBufferSize = 8192;
+constexpr std::size_t kMinBufferSize = 128;
+
+inline bool is_digit(char c) {  // ASCII decimal digit, locale independent
+  return !(c < '0' || c > '9');
+}
+
 // 2.2 Characters
 // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 
@@ -75,12 +98,185 @@ inline bool is_namechar(uint32_t c) {
       (c >= 0x300 && c <= 0x36f) || (c >= 0x203f && c <= 0x2040);
 }
 
-/* [5]   	Name	   ::=   	NameStartChar (NameChar)*
+/*
+[5]   	Name	   ::=   	NameStartChar (NameChar)*
 [6]   	Names	   ::=   	Name (#x20 Name)*
 [7]   	Nmtoken	   ::=   	(NameChar)+
 [8]   	Nmtokens	   ::=   	Nmtoken (#x20 Nmtoken)*
 */
 
+// Folds an ASCII upper-case letter to lower-case; other values pass through.
+inline char ascii_lowercase(char c) {
+  return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
+}
+
+// ASCII case-insensitive equality. Only `a` is folded, so `b` is expected
+// to be given in lower-case already.
+bool eq_lowercase(std::string_view a, std::string_view b) {
+  return a.size() == b.size() &&
+         std::equal(a.begin(), a.end(), b.begin(),
+                    [](char x, char y) { return ascii_lowercase(x) == y; });
+}
+
+// Views the bytes of `span` as characters; no copy is made.
+inline std::string_view make_string_view(std::span<uint8_t const> span) {
+  return {reinterpret_cast<char const*>(span.data()), span.size()};
+}
+
+// Resolves the text between '&' and ';' in a reference to its replacement.
+// Knows the five predefined XML entities and decimal (#N) / hexadecimal
+// (#xN) character references.
+class Entities {
+ public:
+  Entities()
+      : data_{{"lt", "<"}, {"gt", ">"}, {"amp", "&"}, {"apos", "'"},
+              {"quot", "\""}} {}
+
+  // Returns the UTF-8 replacement text for `entity` (given without the
+  // surrounding '&' and ';'), or std::nullopt if it is unknown or malformed.
+  std::optional<std::string> get(std::string const& entity) const {
+    if (!entity.empty() && entity.front() == '#')
+      return char_reference(entity);
+    auto const it = data_.find(entity);
+    if (it == data_.end())
+      return std::nullopt;
+    return it->second;
+  }
+
+ private:
+  // WFC: Legal Character - a character reference must match production [2].
+  static bool is_legal_char(uint32_t c) {
+    return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff) ||
+           (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff);
+  }
+
+  // Decodes "#N" or "#xN"; all of N must parse and name a legal character.
+  static std::optional<std::string> char_reference(std::string const& ref) {
+    bool const hex = ref.size() > 1 && ref[1] == 'x';
+    char const* const end = ref.data() + ref.size();
+    uint32_t value = 0;
+    auto [ptr, ec] = std::from_chars(ref.data() + (hex ? 2 : 1), end, value,
+                                     hex ? 16 : 10);
+    if (ec != std::errc() || ptr != end || !is_legal_char(value))
+      return std::nullopt;
+    uint8_t tmp[4];
+    std::size_t offset = 0;
+    utf::write8(value, tmp, offset);
+    return std::string(reinterpret_cast<char*>(tmp), offset);
+  }
+
+  std::map<std::string, std::string> data_;
+};
+
+// Expands "&name;" references in `str` (from index `last`); false if invalid.
+bool deamp(Entities const& entities, std::string& str, std::size_t last = 0) {
+  for (auto amp = str.find('&', last); amp != std::string::npos;
+       amp = str.find('&', last)) {
+    auto const semicolon = str.find(';', amp + 1);
+    if (semicolon == std::string::npos)
+      return false;
+    auto const value = entities.get(str.substr(amp + 1, semicolon - amp - 1));
+    if (!value.has_value())
+      return false;
+    str.replace(amp, semicolon - amp + 1, *value);
+    last = amp + value->size();  // do not rescan the replacement text
+  }
+  return true;
+}
+
+std::optional<std::string> unquote(Entities const& entities,
+                                   std::string_view quoted) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  std::string text(quoted.data() + 1, quoted.size() - 2);  // drop the quotes
+  if (!deamp(entities, text))  // malformed or unknown reference
+    return std::nullopt;
+  return text;
+}
+
+std::optional<std::string_view> unquote_if_needed(Entities const& entities,
+                                                  std::string_view quoted,
+                                                  std::string& tmp) {
+  assert(quoted.size() >= 2);
+  assert(quoted.front() == quoted.back());
+  auto const inner = quoted.substr(1, quoted.size() - 2);
+  auto const amp = inner.find('&');
+  if (amp == std::string_view::npos)
+    return inner;  // nothing to expand: a view into `quoted`, `tmp` untouched
+  tmp.assign(inner);
+  if (!deamp(entities, tmp, amp))
+    return std::nullopt;
+  return std::string_view(tmp);  // the expanded text lives in `tmp`
+}
+
+// Attributes backed by views into the tag's raw bytes. A value that had
+// references to expand owns a heap copy of its expanded text instead.
+class AttributesImpl : public Attributes {
+ public:
+  AttributesImpl() = default;
+
+  // `offsets` holds (start, length) pairs into `data`; from index `first`
+  // on they alternate between attribute name and quoted attribute value.
+  // Returns false if a value contains an invalid reference. The views
+  // stored for unexpanded values are only valid as long as `data` is.
+  bool init(Entities const& entities,
+            std::span<const uint8_t> data,
+            std::vector<size_t> const& offsets,
+            std::size_t first) {
+    if (first < offsets.size())
+      attr_.reserve((offsets.size() - first) / 4);
+    for (std::size_t a = first; a + 4 <= offsets.size(); a += 4) {
+      auto name = make_string_view(data.subspan(offsets[a], offsets[a + 1]));
+      std::string tmp;
+      auto value = unquote_if_needed(
+          entities,
+          make_string_view(data.subspan(offsets[a + 2], offsets[a + 3])),
+          tmp);
+      if (!value.has_value())
+        return false;
+      if (tmp.empty()) {
+        attr_.emplace_back(name, *value);
+      } else {
+        // `*value` points into `tmp`, which dies with this iteration. Hand
+        // the text itself over; the attribute then views its own copy.
+        attr_.emplace_back(name,
+                           std::make_unique<std::string>(std::move(tmp)));
+      }
+    }
+    return true;
+  }
+
+  iterator begin() const override { return Iterator(this, 0); }
+  iterator end() const override { return Iterator(this, attr_.size()); }
+  std::size_t size() const override { return attr_.size(); }
+  Attribute const& at(std::size_t index) const override { return attr_[index]; }
+
+ private:
+  class Iterator : public iterator {
+   public:
+    Iterator(Attributes const* attributes, std::size_t index)
+        : iterator(attributes, index) {}
+  };
+
+  struct AttributeImpl : public Attribute {
+    AttributeImpl(std::string_view name, std::string_view value)
+        : Attribute(name, value) {}
+
+    // The text is kept on the heap so that the view given to Attribute stays
+    // valid when this object is moved; moving a std::string member would
+    // relocate short strings and leave the view dangling.
+    AttributeImpl(std::string_view name, std::unique_ptr<std::string> owned)
+        : Attribute(name, std::string_view(*owned)),
+          owned_(std::move(owned)) {}
+
+   private:
+    std::unique_ptr<std::string> owned_;
+  };
+
+  std::span<const uint8_t> data_;
+  std::vector<AttributeImpl> attr_;
+};
+
 class ProcessorImpl : public Processor {
  public:
   ProcessorImpl(std::shared_ptr<Delegate> delegate,
@@ -91,15 +287,898 @@ class ProcessorImpl : public Processor {
       : delegate_(std::move(delegate)),
         decoder_factory_(std::move(decoder_factory)),
         decoder_(std::move(decoder)),
-        default_buffer_size_(default_buffer_size),
-        max_buffer_size_(max_buffer_size) {}
+        forced_decoder_(decoder_),
+        buffer_(make_buffer(default_buffer_size, max_buffer_size)) {
+    if (!decoder_)
+      decoder_ = create_guessing_decoder();
+
+    expect_document();
+  }
+
+  std::size_t process(std::span<uint8_t const> data,
+                      std::size_t offset) override {
+    cmds_.emplace_back(Command::FILL_BUFFER, Count::ZERO_OR_ONE);  // on top of cmds_, so it runs first
+
+    std::size_t consumed = 0;
+
+    while (true) {
+      if (cmds_.empty()) {  // every queued command has been handled
+        if (!buffer_->empty()) {
+          std::cerr << make_string_view(buffer_->rspan()) << std::endl;  // NOTE(review): looks like leftover debug output
+          delegate_->error("Extra data at end");
+        }
+        return consumed;
+      }
+
+      auto current = cmds_.back();  // a copy: handlers taking CommandItem& update current.offset
+      auto const old_size = cmds_.size();
+      cmds_.pop_back();
+      Process ret;
+      switch (current.command) {  // one handler per command
+        case Command::FILL_BUFFER:
+          ret = fill_buffer(data, offset, consumed);
+          break;
+        case Command::MISC:
+          ret = process_misc(current);
+          break;
+        case Command::SPACE:
+          ret = process_space(current);
+          break;
+        case Command::ELEMENT:
+          ret = process_element(current);
+          break;
+        case Command::COMMENT:
+          ret = process_comment(current);
+          break;
+        case Command::PROCESSING_INSTRUCTION:
+          ret = process_processing_instruction(current);
+          break;
+        case Command::XMLDECL:
+          ret = process_xmldecl(current);
+          break;
+        case Command::ATTRIBUTE:
+          ret = process_attribute(current);
+          break;
+        case Command::NAME:
+          ret = process_name(current);
+          break;
+        case Command::ATTRIBUTE_VALUE:
+          ret = process_attribute_value(current);
+          break;
+        case Command::EQUAL:
+          ret = process_equal(current);
+          break;
+        case Command::START_OR_EMPTY_TAG:
+          ret = process_start_or_empty_tag(current);
+          break;
+        case Command::END_TAG:
+          ret = process_end_tag(current);
+          break;
+      }
+
+      switch (ret) {
+        case Process::NEED_MORE:
+        case Process::ERROR:
+          cmds_.push_back(current);  // put back, with its updated offset, for the next call
+          assert(cmds_.size() == old_size);  // a handler returning these must not have queued anything
+          return consumed;
+        case Process::CONTINUE:
+          break;
+      }
+    }
+  }
+
+  uint64_t line() const override { return line_; }  // starts at 1; handle_ws() adds one per '\n'
+
+  uint64_t column() const override { return column_; }  // reset to 0 at '\n'; advanced by handle_ws() only
 
  private:
+  enum class Process {  // result of handling one queued command
+    NEED_MORE,  // process() puts the command back and returns
+    ERROR,      // handled like NEED_MORE by process(); the delegate has been told
+    CONTINUE,   // process() goes on with the next queued command
+  };
+
+  enum class Match {  // result of match(), match_consume(), match_s() and find()
+    FULL_MATCH,
+    PARTIAL_MATCH,  // the buffered data ended before the outcome was known
+    NO_MATCH,
+  };
+
+  enum class Command {  // what a queued CommandItem asks process() to do
+    FILL_BUFFER,  // handled by fill_buffer(); queued at the start of every process() call
+
+    ATTRIBUTE,  // the commands below are dispatched to their process_*() handler
+    ATTRIBUTE_VALUE,
+    COMMENT,
+    ELEMENT,
+    END_TAG,
+    EQUAL,
+    MISC,
+    NAME,
+    PROCESSING_INSTRUCTION,
+    SPACE,
+    START_OR_EMPTY_TAG,
+    XMLDECL,
+  };
+
+  enum class Count {  // how often a command has to / may match
+    ONE,           // no match is an error (see no_match())
+    ONE_OR_MANY,   // no match is an error; re-queued as ZERO_OR_MANY by add_if_more()
+    ZERO_OR_ONE,   // no match is fine
+    ZERO_OR_MANY,  // no match is fine; re-queued by add_if_more()
+  };
+
+  struct CommandItem {  // one entry of cmds_
+    Command const command;
+    Count const count;
+    std::size_t offset;  // progress of the handler; its meaning differs per command
+
+    CommandItem(Command command, Count count, std::size_t offset = 0)
+        : command(command), count(count), offset(offset) {}
+  };
+
+  struct StackItem {  // one entry of stack_, created by add_to_stack()
+    std::vector<std::size_t> offsets;  // (start, length) pairs pushed by process_name() / process_attribute_value()
+  };
+
+  Process fill_buffer(std::span<uint8_t const> data,
+                      std::size_t offset,
+                      std::size_t& consumed) {
+    if (offset >= data.size())  // no input left to decode
+      return Process::NEED_MORE;
+
+    std::size_t tmp = offset;
+    auto wspan = buffer_->wspan(4);
+    switch (decoder_->decode(data, tmp, wspan, consumed)) {  // NOTE(review): `tmp` is never read back and `consumed` is also what gets committed below - confirm against Decoder::decode()
+      case Decoder::State::GOOD:
+        break;
+      case Decoder::State::NEED_MORE:
+        return Process::NEED_MORE;
+      case Decoder::State::INVALID:
+        delegate_->error("Invalid data");
+        return Process::ERROR;
+    }
+    buffer_->commit(consumed);
+    return Process::CONTINUE;
+  }
+
+  void expect_document() {
+    // document := prolog element Misc*
+    expect_misc(Count::ZERO_OR_MANY);  // queued first, handled last: process() takes from the back of cmds_
+    expect_element(Count::ONE);
+    expect_prolog();
+  }
+
+  void expect_misc(Count count) {  // Misc := Comment | PI | S
+    cmds_.push_back(CommandItem(Command::MISC, count));
+  }
+
+  void expect_element(Count count) {
+    // element ::= EmptyElemTag | STag content ETag; both begin with a tag.
+    cmds_.push_back(CommandItem(Command::START_OR_EMPTY_TAG, count));
+  }
+
+  void expect_end_tag(Count count) {  // ETag ::= '</' Name S? '>'
+    cmds_.push_back(CommandItem(Command::END_TAG, count));
+  }
+
+  void expect_prolog() {
+    // prolog := XMLDecl? Misc* (doctypedecl Misc*)? - queued in reverse order
+    expect_misc(Count::ZERO_OR_MANY);
+    expect_doctypedecl(Count::ZERO_OR_ONE);  // currently queues nothing
+    expect_misc(Count::ZERO_OR_MANY);
+    expect_xmldecl(Count::ZERO_OR_ONE);  // queued last, so handled first
+  }
+
+  void expect_xmldecl(Count count) {  // handled by process_xmldecl()
+    cmds_.push_back(CommandItem(Command::XMLDECL, count));
+  }
+
+  void expect_doctypedecl(Count) {
+    // TODO: doctypedecl is not implemented; nothing is queued.
+  }
+
+  void expect_comment(Count count, std::size_t start_offset = 0) {
+    // It is MISC that repeats; a comment itself is only ever expected once.
+    assert(count == Count::ONE);
+    cmds_.push_back(CommandItem(Command::COMMENT, count, start_offset));
+  }
+
+  void expect_content(Count) {
+    // TODO: element content is not implemented; nothing is queued.
+  }
+
+  void expect_pi(Count count, std::size_t start_offset = 0) {
+    // It is MISC that repeats; a PI itself is only ever expected once.
+    assert(count == Count::ONE);
+    cmds_.push_back(
+        CommandItem(Command::PROCESSING_INSTRUCTION, count, start_offset));
+
+  void expect_space(Count count) {
+    // S matches a whole run of whitespace, so "S S" cannot occur and only
+    // ONE or ZERO_OR_ONE make sense here.
+    assert(count == Count::ONE || count == Count::ZERO_OR_ONE);
+    cmds_.push_back(CommandItem(Command::SPACE, count));
+  }
+
+  void expect_attribute(Count count) {
+    switch (count) {
+      case Count::ONE_OR_MANY:
+        cmds_.emplace_back(Command::ATTRIBUTE, Count::ZERO_OR_MANY);
+        [[fallthrough]];  // one mandatory attribute first, then the optional ones
+      case Count::ONE:  // S Attribute; Attribute ::= Name Eq AttValue (queued in reverse)
+        expect_attribute_value(Count::ONE);
+        expect_equal(Count::ONE);
+        expect_name(Count::ONE);
+        expect_space(Count::ONE);
+        break;
+      case Count::ZERO_OR_ONE:
+      case Count::ZERO_OR_MANY:
+        cmds_.emplace_back(Command::ATTRIBUTE, count);  // decided by process_attribute()
+        break;
+    }
+  }
+
+  void expect_attribute_value(Count count) {  // AttValue, quotes included
+    cmds_.push_back(CommandItem(Command::ATTRIBUTE_VALUE, count));
+  }
+
+  void expect_equal(Count count) {
+    // Eq ::= S? '=' S? - process_equal() only deals with the '=' itself.
+    expect_space(Count::ZERO_OR_ONE);  // the S? after '=' (queued first, handled last)
+    cmds_.emplace_back(Command::EQUAL, count);
+    expect_space(Count::ZERO_OR_ONE);  // the S? before '='
+  }
+
+  void expect_name(Count count) {  // Name ::= NameStartChar (NameChar)*
+    cmds_.push_back(CommandItem(Command::NAME, count));
+  }
+
+  Process process_misc(CommandItem const& item) {
+    // Misc := Comment | PI | S
+    // The alternatives are probed in this order without consuming anything.
+    // The first one that matches is queued and does the consuming itself;
+    // before that, Misc is queued again if its count allows it to repeat.
+    // PARTIAL_MATCH means the buffered data is too short to decide.
+    assert(item.offset == 0);
+
+    auto const comment = match("<!--");
+    if (comment == Match::PARTIAL_MATCH)
+      return Process::NEED_MORE;
+    if (comment == Match::FULL_MATCH) {
+      add_if_more(item);
+      expect_comment(Count::ONE, 3);
+      return Process::CONTINUE;
+    }
+
+    auto const pi = match("<?");
+    if (pi == Match::PARTIAL_MATCH)
+      return Process::NEED_MORE;
+    if (pi == Match::FULL_MATCH) {
+      add_if_more(item);
+      expect_pi(Count::ONE, 2);
+      return Process::CONTINUE;
+    }
+
+    auto const space = match_s();
+    if (space == Match::PARTIAL_MATCH)
+      return Process::NEED_MORE;
+    if (space == Match::FULL_MATCH) {
+      add_if_more(item);
+      expect_space(Count::ONE);
+      return Process::CONTINUE;
+    }
+
+    // None of the alternatives is present; whether that is an error depends
+    // on the count.
+    return no_match(item);
+  }
+
+  Process process_attribute(CommandItem& item) {
+    // Handles the optional forms of (S Attribute); expect_attribute() queues
+    // the mandatory ones directly. Whether the S starts an attribute or is
+    // just trailing S is decided by the first non-S character: only a name
+    // start character can begin an attribute. The S is consumed either way,
+    // and item.offset counts the whitespace characters seen so far.
+    uint32_t last_char;
+    auto ret = consume_space(item.offset, last_char);
+    if (ret != Process::CONTINUE)
+      return ret;
+
+    // No S at all, or S followed by something that cannot start a name:
+    // no (further) attribute here.
+    if (item.offset == 0 || !is_namestartchar(last_char))
+      return no_match(item);
+
+    // Queue this command again *below* the parts of the attribute, so that a
+    // repeating count goes on looking for attributes after this one. Without
+    // it only the first of several attributes would be parsed.
+    add_if_more(item);
+    expect_attribute_value(Count::ONE);
+    expect_equal(Count::ONE);
+    expect_name(Count::ONE);
+    return Process::CONTINUE;
+  }
+
+  Process process_equal(CommandItem const& item) {
+    // Eq ::= S? '=' S?
+    // Only the '=' is handled here; the optional whitespace on either side
+    // is queued by expect_equal().
+    auto const sign = match_consume("=");
+    if (sign == Match::PARTIAL_MATCH)
+      return Process::NEED_MORE;
+    if (sign == Match::NO_MATCH)
+      return no_match(item);
+
+    add_if_more(item);
+    return Process::CONTINUE;
+  }
+
+  Process process_name(CommandItem& item) {
+    // Name ::= NameStartChar (NameChar)*
+    // item.offset = bytes of the name accepted so far (kept across NEED_MORE).
+    auto data = buffer_->rspan(item.offset + 4);
+    while (true) {
+      std::size_t next = item.offset;
+      auto c = utf::read8(data, next);
+      if (c == utf::NEED_MORE)
+        return Process::NEED_MORE;
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, item.offset);  // not `next`: it is past c
+      if (item.offset == 0) {
+        if (!is_namestartchar(c))
+          return no_match(item);
+      } else if (!is_namechar(c)) {
+        break;  // first character that is not part of the name
+      }
+      item.offset = next;
+    }
+
+    assert(!stack_.empty());
+    auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+    stack_.back().offsets.push_back(read_view->consumed());  // start
+    stack_.back().offsets.push_back(item.offset);            // length
+    buffer_->consume(item.offset);
+    return Process::CONTINUE;
+  }
+
+  Process process_attribute_value(CommandItem& item) {
+    // AttValue ::= '"' ([^<&"] | Reference)* '"'
+    //              | "'" ([^<&'] | Reference)* "'"
+    uint32_t end_char;
+    auto data = buffer_->rspan(item.offset + 4);  // item.offset: bytes accepted so far
+
+    if (item.offset == 0) {
+      std::size_t next = 0;
+      auto c = utf::read8(data, next);
+      if (c == utf::NEED_MORE)
+        return Process::NEED_MORE;
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, 0);
+      if (c != '"' && c != '\'')
+        return no_match(item);
+      item.offset = next;
+      end_char = c;
+    } else {
+      assert(!data.empty());
+      end_char = data[0];  // ok as both " and ' are ASCII
+    }
+
+    while (true) {
+      std::size_t next = item.offset;
+      auto c = utf::read8(data, next);
+      if (c == utf::NEED_MORE)
+        return Process::NEED_MORE;
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, item.offset);  // not `next`: it is past c
+      item.offset = next;
+      if (c == end_char)
+        break;
+      // TODO: Validate references here, or leave that to unquote? Looking
+      // for end_char only is safe as a Reference cannot contain end_char.
+    }
+
+    assert(!stack_.empty());
+    auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+    stack_.back().offsets.push_back(read_view->consumed());  // start
+    stack_.back().offsets.push_back(item.offset);  // length, quotes included
+    buffer_->consume(item.offset);
+    return Process::CONTINUE;
+  }
+
+  Process process_comment(CommandItem& item) {
+    // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
+    // Nothing is consumed before the whole comment is buffered. item.offset
+    // is where the search for "-->" resumes; below 4 = "<!--" not skipped yet.
+    constexpr std::string_view kOpen = "<!--";
+    constexpr std::string_view kClose = "-->";
+    if (item.offset < kOpen.size()) {
+      auto const opening = match(kOpen);
+      if (opening == Match::PARTIAL_MATCH)
+        return Process::NEED_MORE;
+      if (opening == Match::NO_MATCH)
+        return no_match(item);
+      item.offset = kOpen.size();
+    }
+
+    auto const data = buffer_->rspan(item.offset + kClose.size());
+    auto const text = make_string_view(data);
+    auto const end = text.find(kClose, item.offset);
+    if (end == std::string_view::npos) {
+      // The last two bytes may be the start of "-->": search them again.
+      if (text.size() > item.offset + 2)
+        item.offset = text.size() - 2;
+      return Process::NEED_MORE;
+    }
+    delegate_->comment(text.substr(kOpen.size(), end - kOpen.size()));
+    buffer_->consume(end + kClose.size());
+    return Process::CONTINUE;
+  }
+
+  Process process_processing_instruction(CommandItem& item) {
+    // process_misc() queues this with offset 2 once it has seen "<?"; with
+    // offset 0 the "<?" is matched and consumed here.
+    if (item.offset == 0) {
+      auto const opening = match_consume("<?");
+      if (opening == Match::PARTIAL_MATCH)
+        return Process::NEED_MORE;
+      if (opening == Match::NO_MATCH)
+        return no_match(item);
+      item.offset += 2;
+    }
+
+    // TODO: processing instructions are not implemented; every PI is
+    // reported as an error for now.
+    delegate_->error("PI not supported");
+    return Process::ERROR;
+  }
+
+  void add_to_stack(CommandItem const& item, std::size_t offset) {
+    cmds_.emplace_back(item.command, item.count, offset);  // same command again; reached once everything queued after it is done
+    stack_.emplace_back();  // collects the offsets recorded by process_name() / process_attribute_value()
+    buffer_ = make_read_view_buffer(std::move(buffer_));  // takes over the current buffer; pop_stack() gets it back via release()
+    buffer_->consume(offset);
+  }
+
+  std::size_t pop_stack(std::vector<std::size_t>& attr) {
+    assert(!stack_.empty());
+    std::swap(attr, stack_.back().offsets);  // hands the recorded offsets to the caller
+
+    auto* read_view = static_cast<ReadViewBuffer*>(buffer_.get());
+    auto consumed = read_view->consumed();  // must be read before buffer_ is replaced below
+
+    buffer_ = read_view->release();  // back to the buffer that add_to_stack() handed over
+    stack_.pop_back();
+
+    return consumed;
+  }
+
+  Process process_xmldecl(CommandItem const& item) {
+    // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+    if (item.offset == 0) {  // first visit: look for the opening
+      switch (match("<?xml")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 5);
+          expect_space(Count::ZERO_OR_ONE);
+          // Parsing as generic "Attribute" here and doing validation later.
+          expect_attribute(Count::ONE_OR_MANY);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    assert(item.offset == 5);  // second visit, queued by add_to_stack() above
+
+    // Remember that this is still reading from the read view buffer.
+    switch (match_consume("?>")) {
+      case Match::FULL_MATCH:
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        delegate_->error(std::format("Expected end of {}",
+                                     command_name(item.command)));
+        return Process::ERROR;
+    }
+
+    std::vector<std::size_t> attr;  // 4 entries per attribute: name start, name length, value start, value length
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+    std::size_t a = 0;  // index into attr of the attribute looked at next
+
+    if (a + 4 <= attr.size() &&
+        make_string_view(data.subspan(attr[a + 0],
+                                      attr[a + 1])) == "version") {
+      auto version = make_string_view(data.subspan(attr[a + 2] + 1,
+                                                   attr[a + 3] - 2));  // +1 / -2: without the quotes
+      if (!valid_version(version)) {
+        delegate_->error(std::format("Unsupported xmldecl version, {}",
+                                     version));
+        return Process::ERROR;
+      }
+      a += 4;
+    } else {
+      // No version
+      delegate_->error("Invalid xmldecl, must have a version attribute first.");
+      return Process::ERROR;
+    }
+
+    if (a + 4 <= attr.size() &&
+        make_string_view(data.subspan(attr[a + 0],
+                                      attr[a + 1])) == "encoding") {
+      auto encoding = make_string_view(data.subspan(attr[a + 2] + 1,
+                                                    attr[a + 3] - 2));
+      if (forced_decoder_) {
+        // encoding value is ignored
+        // TODO: Should we check that it is valid anyway?
+      } else {
+        auto decoder = pick_decoder_for_encoding(encoding, nullptr);
+        if (!decoder && decoder_factory_)
+          decoder = decoder_factory_->create(encoding);
+        if (!decoder) {
+          delegate_->error(std::format("Unknown encoding {}", encoding));
+          return Process::ERROR;
+        }
+        std::swap(decoder_, decoder);
+        // TODO: Re-decode the rest of the buffer?
+      }
+      a += 4;
+    }
+
+    if (a + 4 <= attr.size() &&
+        make_string_view(data.subspan(attr[a + 0],
+                                      attr[a + 1])) == "standalone") {
+      auto sd = make_string_view(data.subspan(attr[a + 2] + 1,
+                                              attr[a + 3] - 2));
+      if (sd == "yes") {
+        // TODO: Handle standalone == yes
+      } else if (sd == "no") {
+        // TODO: Handle standalone == no
+      } else {
+        delegate_->error(std::format(
+            "Invalid xmldecl, standalone attribute has unsupported value, {}",
+            sd));
+        return Process::ERROR;
+      }
+      a += 4;
+    }
+
+    if (a < attr.size()) {  // anything left is unknown or out of order
+      delegate_->error(
+          std::format("Invalid xmldecl, unknown attribute, {}",
+                      make_string_view(data.subspan(attr[a + 0],
+                                                    attr[a + 1]))));
+      return Process::ERROR;
+    }
+
+    buffer_->consume(consumed);  // the declaration as a whole
+    return Process::CONTINUE;
+  }
+
+  Process process_start_or_empty_tag(CommandItem const& item) {
+    // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
+    // STag         ::= '<' Name (S Attribute)* S? '>'
+    if (item.offset == 0) {  // first visit: look for the opening
+      switch (match("<")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 1);
+          expect_space(Count::ZERO_OR_ONE);
+          expect_attribute(Count::ZERO_OR_MANY);
+          expect_name(Count::ONE);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    assert(item.offset == 1);  // second visit, queued by add_to_stack() above
+
+    bool empty_tag;
+
+    // Remember that this is still reading from the read view buffer.
+    switch (match_consume("/>")) {
+      case Match::FULL_MATCH:
+        empty_tag = true;
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        switch (match_consume(">")) {
+          case Match::FULL_MATCH:
+            empty_tag = false;
+            break;
+          case Match::PARTIAL_MATCH:
+            return Process::NEED_MORE;
+          case Match::NO_MATCH:
+            delegate_->error(std::format("Expected end of {}",
+                                         command_name(item.command)));
+            return Process::ERROR;
+        }
+        break;
+    }
+
+    std::vector<std::size_t> attr;  // (start, length) of the name, then of each attribute name and value
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+
+    assert(attr.size() >= 2);
+    auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+    AttributesImpl attributes;
+    if (!attributes.init(entities_, data, std::move(attr), 2)) {  // 2: skip the element name
+      delegate_->error("Invalid references in attribute values");
+      return Process::ERROR;
+    }
+
+    add_if_more(item);
+
+    if (empty_tag) {
+      delegate_->empty_element(name, attributes);
+    } else {
+      delegate_->start_element(name, attributes);
+      expect_end_tag(Count::ONE);
+      expect_content(Count::ONE);  // currently queues nothing
+    }
+
+    buffer_->consume(consumed);  // only now: `name` and `attributes` view this data
+    return Process::CONTINUE;
+  }
+
+  Process process_end_tag(CommandItem const& item) {
+    // ETag ::= '</' Name S? '>'
+    if (item.offset == 0) {  // first visit: look for the opening
+      switch (match("</")) {
+        case Match::FULL_MATCH:
+          add_to_stack(item, /* offset */ 2);
+          expect_space(Count::ZERO_OR_ONE);
+          expect_name(Count::ONE);
+          return Process::CONTINUE;
+        case Match::PARTIAL_MATCH:
+          return Process::NEED_MORE;
+        case Match::NO_MATCH:
+          return no_match(item);
+      }
+    }
+
+    assert(item.offset == 2);  // second visit, with the offset add_to_stack() was given above
+
+    // Name and S? are done; this is still reading from the read view buffer.
+    switch (match_consume(">")) {
+      case Match::FULL_MATCH:
+        break;
+      case Match::PARTIAL_MATCH:
+        return Process::NEED_MORE;
+      case Match::NO_MATCH:
+        delegate_->error(std::format("Expected end of {}",
+                                     command_name(item.command)));
+        return Process::ERROR;
+    }
+
+    std::vector<std::size_t> attr;  // (start, length) of the element name
+    auto const consumed = pop_stack(attr);
+
+    // Now we're back to the real buffer
+    auto data = buffer_->rspan(consumed);
+
+    assert(attr.size() == 2);
+    auto name = make_string_view(data.subspan(attr[0], attr[1]));
+
+    add_if_more(item);
+
+    delegate_->end_element(name);
+
+    buffer_->consume(consumed);  // only now: `name` views this data
+    return Process::CONTINUE;
+  }
+
+  // Accepts the version numbers this parser supports: "1." followed by one
+  // or more ASCII digits, e.g. "1.0" or "1.1".
+  static bool valid_version(std::string_view version) {
+    if (version.size() < 3 || !version.starts_with("1."))
+      return false;
+
+    // Everything after the "1." has to be a digit.
+    auto const minor = version.substr(2);
+    return std::all_of(minor.begin(), minor.end(),
+                       [](char c) { return is_digit(c); });
+  }
+
+  Process process_element(CommandItem& /* item */) {
+    // TODO: not implemented yet, so reaching this is always an error.
+    delegate_->error("Element is not yet supported");
+    return Process::ERROR;
+  }
+
+  // Consumes the run of whitespace at the front of the buffer, adding the
+  // number of characters consumed to `count`. On CONTINUE `last_char` is
+  // the first non-whitespace character, which is left in the buffer.
+  Process consume_space(std::size_t& count, uint32_t& last_char) {
+    auto data = buffer_->rspan(4);
+    for (std::size_t consumed = 0, next = 0;; consumed = next) {
+      auto c = utf::read8(data, next);
+      if (c == utf::NEED_MORE) {
+        buffer_->consume(consumed);
+        return Process::NEED_MORE;
+      }
+      if (c == utf::INVALID || !valid_char(c))
+        return invalid_char(data, consumed);  // not `next`: it is past c
+      if (!is_ws(c)) {
+        last_char = c;
+        buffer_->consume(consumed);
+        return Process::CONTINUE;
+      }
+      ++count;
+      handle_ws(c);
+    }
+  }
+
+  Process process_space(CommandItem& item) {
+    // S ::= (#x20 | #x9 | #xD | #xA)+
+    // Whitespace is consumed from the buffer as it is found, so item.offset
+    // is not a buffer position here; it is the number of whitespace
+    // characters seen so far.
+    uint32_t next_char;  // required by consume_space(), not used here
+    auto const status = consume_space(item.offset, next_char);
+    if (status != Process::CONTINUE)
+      return status;
+    if (item.offset > 0) {
+      add_if_more(item);
+      return Process::CONTINUE;
+    }
+    return no_match(item);
+  }
+
+  // Queues `item`'s command again if its count allows more occurrences.
+  // Meant to be called once an occurrence has been matched: from then on
+  // ONE_OR_MANY continues as ZERO_OR_MANY. The queued copy starts out with
+  // offset 0. ONE and ZERO_OR_ONE never repeat, so nothing is queued for
+  // them.
+  void add_if_more(CommandItem const& item) {
+    bool const repeats = item.count == Count::ONE_OR_MANY ||
+                         item.count == Count::ZERO_OR_MANY;
+    if (!repeats)
+      return;
+
+    cmds_.emplace_back(item.command, Count::ZERO_OR_MANY);
+  }
+
+  // Searches the buffered data for `str`, starting at `offset`.
+  // FULL_MATCH:    `offset` is the index of the first byte of the match.
+  // PARTIAL_MATCH: the data ends with a proper prefix of `str`, which begins
+  //                at `offset`; more data is needed to decide.
+  // NO_MATCH:      `offset` is the end of the data that was searched.
+  Match find(std::string_view str, std::size_t& offset) {
+    auto const data = buffer_->rspan(offset + str.size());
+    auto const haystack = make_string_view(data);
+    if (offset >= haystack.size())
+      return Match::NO_MATCH;
+    if (auto const pos = haystack.find(str, offset); pos != haystack.npos) {
+      offset = pos;
+      return Match::FULL_MATCH;
+    }
+    auto const tail = haystack.substr(offset);
+    auto len = std::min(tail.size(), str.empty() ? 0 : str.size() - 1);
+    while (len > 0 && !tail.ends_with(str.substr(0, len)))
+      --len;
+    offset = haystack.size() - len;
+    return len > 0 ? Match::PARTIAL_MATCH : Match::NO_MATCH;
+  }
+
+  // Compares the buffered data at `offset` with `str`; nothing is consumed.
+  Match match(std::string_view str, std::size_t offset = 0) {
+    auto data = buffer_->rspan(offset + str.size());
+    if (offset >= data.size())
+      return Match::PARTIAL_MATCH;  // nothing to compare with yet
+    auto const count = std::min(str.size(), data.size() - offset);
+    std::size_t i = 0;
+    while (i < count && str[i] == data[offset + i])
+      ++i;
+    if (i < count)
+      return Match::NO_MATCH;
+    return count < str.size() ? Match::PARTIAL_MATCH : Match::FULL_MATCH;
+  }
+
+  Match match_consume(std::string_view str) {  // match() at offset 0 ...
+    auto const outcome = match(str);
+    if (Match::FULL_MATCH == outcome)
+      buffer_->consume(str.size());  // ... and only a full match is consumed
+    return outcome;
+  }
+
+  // Is the next buffered character whitespace (production S)? Nothing is
+  // consumed. PARTIAL_MATCH is only reported when the buffer is empty.
+  Match match_s() {
+    auto data = buffer_->rspan(4);
+    std::size_t pos = 0;
+    auto const c = utf::read8(data, pos);
+    if (c == utf::NEED_MORE)
+      return data.empty() ? Match::PARTIAL_MATCH : Match::NO_MATCH;
+    if (c != utf::INVALID && valid_char(c) && is_ws(c))
+      return Match::FULL_MATCH;
+    return Match::NO_MATCH;
+  }
+
+  // Called when `item` did not match. An error is reported to the delegate
+  // if at least one occurrence was required (ONE, ONE_OR_MANY); for the
+  // optional counts parsing simply goes on.
+  Process no_match(CommandItem const& item) {
+    bool const required =
+        item.count == Count::ONE || item.count == Count::ONE_OR_MANY;
+    if (!required)
+      return Process::CONTINUE;
+
+    delegate_->error(std::format("Expected {}",
+                                 command_name(item.command)));
+    return Process::ERROR;
+  }
+
+  void handle_ws(uint32_t c) {  // line/column bookkeeping for whitespace
+    if (c != '\n') {
+      ++column_;
+      return;
+    }
+    ++line_;
+    column_ = 0;
+  }
+
+  Process invalid_char(std::span<uint8_t const> data, std::size_t offset) {  // `offset` must index the offending byte inside `data`
+    delegate_->error(std::format("Invalid char {:02x}", data[offset]));  // reports a single byte, in hex
+    return Process::ERROR;
+  }
+
+  static std::string_view command_name(Command command) {
+    switch (command) {
+      case Command::MISC:
+        return "misc"sv;
+      case Command::FILL_BUFFER:
+        return "more data"sv;
+      case Command::ELEMENT:
+        return "element"sv;
+      case Command::SPACE:
+        return "whitespace"sv;
+      case Command::COMMENT:
+        return "comment"sv;
+      case Command::PROCESSING_INSTRUCTION:
+        return "processing instruction"sv;
+      case Command::XMLDECL:
+        return "xml declaration"sv;
+      case Command::ATTRIBUTE:
+        return "attribute"sv;
+      case Command::ATTRIBUTE_VALUE:
+        return "attribute value"sv;
+      case Command::NAME:
+        return "name"sv;
+      case Command::EQUAL:
+        return "equal sign (=)"sv;
+      case Command::START_OR_EMPTY_TAG:
+        return "element"sv;
+      case Command::END_TAG:
+        return "end tag"sv;
+    }
+    assert(false);
+    return {};
+  }
+
   std::shared_ptr<Delegate> delegate_;
   std::shared_ptr<DecoderFactory> decoder_factory_;
   std::unique_ptr<Decoder> decoder_;
-  std::size_t default_buffer_size_;
-  std::size_t max_buffer_size_;
+  bool const forced_decoder_;
+  std::unique_ptr<Buffer> buffer_;
+  Entities entities_;
+  std::vector<CommandItem> cmds_;
+  std::vector<StackItem> stack_;
+  uint64_t line_{1};
+  uint64_t column_{0};
 };
 
 }  // namespace
@@ -117,9 +1196,9 @@ std::unique_ptr<Processor> create_processor(
                                         decoder_factory.get());
   }
 
-  std::size_t default_buffer_size = 8192;
+  std::size_t default_buffer_size = kDefaultBufferSize;
   if (opt_default_buffer_size.has_value())
-    default_buffer_size = std::max(static_cast<std::size_t>(128),
+    default_buffer_size = std::max(kMinBufferSize,
                                    opt_default_buffer_size.value());
   // This value is documented in public headers. Do NOT change.
   std::size_t max_buffer_size = 10 * 1024 * 1024;
@@ -136,7 +1215,8 @@ std::unique_ptr<Processor> create_processor(
                                          max_buffer_size);
 }
 
-std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) {
+std::unique_ptr<Processor>
+Processor::create(std::shared_ptr<Delegate> delegate) {
   return create_processor(std::move(delegate), nullptr,
                           std::nullopt, std::nullopt, std::nullopt);
 }