diff options
Diffstat (limited to 'sax/src/guessing_decoder.cc')
| -rw-r--r-- | sax/src/guessing_decoder.cc | 92 |
1 files changed, 92 insertions, 0 deletions
diff --git a/sax/src/guessing_decoder.cc b/sax/src/guessing_decoder.cc new file mode 100644 index 0000000..e72dab3 --- /dev/null +++ b/sax/src/guessing_decoder.cc @@ -0,0 +1,92 @@ +#include "guessing_decoder.hh" + +#include "decoder.hh" +#include "sax_decoder.hh" +#include "utf8.hh" +#include "utf_error.hh" + +#include <cassert> + +using namespace std::string_view_literals; + +namespace modxml { +namespace sax { + +namespace { + +bool eq(std::span<uint8_t const> a, std::size_t& a_offset, std::string_view b) { + if (a.size() - a_offset < b.size()) + return false; + for (size_t i = 0; i < b.size(); ++i) + if (a[a_offset + i] != b[i]) + return false; + return true; +} + +class GuessingDecoder : public Decoder { + public: + State decode(std::span<uint8_t const> in, std::size_t& in_offset, + std::span<uint8_t> out, std::size_t& out_offset) override { + assert(in_offset <= in.size()); + + if (!decided_) { + if (eq(in, in_offset, "\xef\xbb\xbf"sv)) { + decided_ = create_utf8_decoder(); + } else if (eq(in, in_offset, "\xfe\xff\x00\x00"sv)) { + in_offset += 4; + decided_ = create_utf32be_decoder(); + } else if (eq(in, in_offset, "\xfe\xff"sv)) { + // Could be UTF-32 BOM, need more data to decide + // (note, an xml document encoded in UTF-16 that is less than 4 bytes + // is rather impossible). + if (in.size() - in_offset < 4) + return State::NEED_MORE; + in_offset += 2; + decided_ = create_utf16be_decoder(); + } else if (eq(in, in_offset, "\xff\xfe"sv)) { + in_offset += 2; + decided_ = create_utf16le_decoder(); + } else if (eq(in, in_offset, "\x00\x00\xff\xfe"sv)) { + in_offset += 4; + decided_ = create_utf32le_decoder(); + } else { + auto avail = in.size() - in_offset; + if (avail == 0) + return State::NEED_MORE; + if (avail >= 4 && in[in_offset] == 0 && in[in_offset + 1] == 0 + && in[in_offset + 2] == 0 && in[in_offset + 3] != 0) { + decided_ = create_utf32le_decoder(); + } else if (avail >= 4 && in[in_offset] != 0 && in[in_offset + 1] == 0 + && in[in_offset + 2] == 0 && in[in_offset + 3] == 0) { + decided_ = create_utf32be_decoder(); + } else if (avail >= 2 && in[in_offset] == 0 && in[in_offset + 1] != 0) { + decided_ = create_utf16le_decoder(); + } else if (avail >= 2 && in[in_offset] != 0 && in[in_offset + 1] == 0) { + decided_ = create_utf16be_decoder(); + } else { + auto tmp = in_offset; + auto ret = utf::read8(in, tmp); + if (ret == utf::NEED_MORE) + return State::NEED_MORE; + if (ret == utf::INVALID) + return State::INVALID; + // UTF-8 should be good enough to read the XML declaration. + decided_ = create_utf8_decoder(); + } + } + } + return decided_->decode(in, in_offset, out, out_offset); + } + + private: + std::unique_ptr<Decoder> decided_; +}; + +} // namespace + +std::unique_ptr<Decoder> create_guessing_decoder() { + return std::make_unique<GuessingDecoder>(); +} + +} // namespace sax +} // namespace modxml |
