#include "guessing_decoder.hh"

#include "decoder.hh"
#include "sax_decoder.hh"
#include "utf8.hh"
#include "utf_error.hh"

#include <cassert>

using namespace std::string_view_literals;

namespace modxml {
namespace sax {

namespace {

// Returns true when the bytes of `a` starting at `a_offset` begin with `b`,
// false otherwise (including when fewer than b.size() bytes are available).
//
// Requires a_offset <= a.size().
//
// Both sides are compared as unsigned char so that signature bytes >= 0x80
// match regardless of whether plain char is signed.
bool eq(std::span a, std::size_t a_offset, std::string_view b) {
  if (a.size() - a_offset < b.size())
    return false;
  for (std::size_t i = 0; i < b.size(); ++i) {
    auto const have = static_cast<unsigned char>(a[a_offset + i]);
    auto const want = static_cast<unsigned char>(b[i]);
    if (have != want)
      return false;
  }
  return true;
}

// Decoder that looks at the first bytes of the input to pick a concrete
// decoder and then forwards every call to it.
//
// Detection follows the autodetection table of XML 1.0 (appendix F):
//
//   EF BB BF       UTF-8 byte order mark
//   00 00 FE FF    UTF-32BE byte order mark
//   FF FE 00 00    UTF-32LE byte order mark
//   FE FF          UTF-16BE byte order mark
//   FF FE          UTF-16LE byte order mark
//   00 00 00 xx    UTF-32BE without byte order mark
//   xx 00 00 00    UTF-32LE without byte order mark
//   00 xx          UTF-16BE without byte order mark
//   xx 00          UTF-16LE without byte order mark
//   anything else  UTF-8
//
// (xx is any non-zero byte.)
class GuessingDecoder : public Decoder {
 public:
  // Decides on an encoding the first time enough input is available, then
  // delegates to the chosen decoder.
  //
  // Returns State::NEED_MORE without consuming anything while fewer than four
  // bytes are available and no decision has been made yet, State::INVALID if
  // the input matches no signature and does not start with a valid UTF-8
  // sequence, and otherwise whatever the chosen decoder returns.
  //
  // For UTF-16 and UTF-32 a leading byte order mark is consumed here, so the
  // chosen decoder never sees it.
  State decode(std::span in, std::size_t& in_offset,
               std::span out, std::size_t& out_offset) override {
    assert(in_offset <= in.size());
    if (!decided_) {
      // No signature is longer than four bytes, but several are prefixes of
      // longer ones (FF FE vs. FF FE 00 00, xx 00 vs. xx 00 00 00). Deciding
      // on less would risk picking the wrong encoding, and no well-formed
      // document is shorter than four bytes ("<a/>"), so just wait.
      if (in.size() - in_offset < 4)
        return State::NEED_MORE;

      if (eq(in, in_offset, "\xef\xbb\xbf"sv)) {
        // NOTE(review): unlike the branches below the byte order mark is not
        // consumed here; presumably the UTF-8 decoder deals with it - confirm.
        decided_ = create_utf8_decoder();
      } else if (eq(in, in_offset, "\x00\x00\xfe\xff"sv)) {
        in_offset += 4;
        decided_ = create_utf32be_decoder();
      } else if (eq(in, in_offset, "\xff\xfe\x00\x00"sv)) {
        // Must be tested before the UTF-16LE mark, which is a prefix of this
        // one. U+0000 is not a legal XML character, so FF FE 00 00 cannot be
        // the start of a UTF-16LE document.
        in_offset += 4;
        decided_ = create_utf32le_decoder();
      } else if (eq(in, in_offset, "\xfe\xff"sv)) {
        in_offset += 2;
        decided_ = create_utf16be_decoder();
      } else if (eq(in, in_offset, "\xff\xfe"sv)) {
        in_offset += 2;
        decided_ = create_utf16le_decoder();
      } else {
        // No byte order mark. The first character of a document is ASCII
        // ('<' or whitespace), so the position of the zero bytes around it
        // reveals both the code unit size and the byte order.
        bool const zero0 = in[in_offset] == 0;
        bool const zero1 = in[in_offset + 1] == 0;
        bool const zero2 = in[in_offset + 2] == 0;
        bool const zero3 = in[in_offset + 3] == 0;

        if (zero0 && zero1 && zero2 && !zero3) {
          decided_ = create_utf32be_decoder();
        } else if (!zero0 && zero1 && zero2 && zero3) {
          decided_ = create_utf32le_decoder();
        } else if (zero0 && !zero1) {
          decided_ = create_utf16be_decoder();
        } else if (!zero0 && zero1) {
          decided_ = create_utf16le_decoder();
        } else {
          // Probe one UTF-8 sequence on a copy of the offset so that nothing
          // is consumed by the check itself.
          auto probe_offset = in_offset;
          auto const first = utf::read8(in, probe_offset);
          if (first == utf::NEED_MORE)
            return State::NEED_MORE;
          if (first == utf::INVALID)
            return State::INVALID;
          // UTF-8 should be good enough to read the XML declaration.
          decided_ = create_utf8_decoder();
        }
      }
    }
    return decided_->decode(in, in_offset, out, out_offset);
  }

 private:
  // The decoder chosen by decode(); null until a decision has been made.
  std::unique_ptr decided_;
};

}  // namespace

// Creates a decoder that detects the encoding from the first bytes of the
// input (byte order mark or zero byte pattern) and falls back to UTF-8.
std::unique_ptr create_guessing_decoder() {
  return std::make_unique();
}

}  // namespace sax
}  // namespace modxml