summaryrefslogtreecommitdiff
path: root/sax/src/guessing_decoder.cc
diff options
context:
space:
mode:
Diffstat (limited to 'sax/src/guessing_decoder.cc')
-rw-r--r--sax/src/guessing_decoder.cc92
1 files changed, 92 insertions, 0 deletions
diff --git a/sax/src/guessing_decoder.cc b/sax/src/guessing_decoder.cc
new file mode 100644
index 0000000..e72dab3
--- /dev/null
+++ b/sax/src/guessing_decoder.cc
@@ -0,0 +1,92 @@
+#include "guessing_decoder.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+#include <cassert>
+
+using namespace std::string_view_literals;
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Returns true if the bytes of `a` starting at index `a_offset` begin with the
+// bytes of `b`. Returns false if fewer than b.size() bytes remain.
+// `a_offset` must not be larger than a.size(); it is only read, never changed.
+bool eq(std::span<uint8_t const> a, std::size_t& a_offset, std::string_view b) {
+  if (a.size() - a_offset < b.size())
+    return false;
+  for (std::size_t i = 0; i < b.size(); ++i) {
+    // Compare as unsigned bytes. Where plain char is signed, a char such as
+    // '\xef' promotes to a negative int and would never equal the uint8_t
+    // 0xef, so patterns containing bytes >= 0x80 could never match.
+    if (a[a_offset + i] != static_cast<uint8_t>(b[i]))
+      return false;
+  }
+  return true;
+}
+
+/**
+ * Decoder used while the encoding of the input is still unknown.
+ *
+ * Until an encoding has been chosen, decode() inspects the unread input for a
+ * byte order mark (BOM) and, if there is none, for the pattern of zero bytes
+ * that a leading ASCII character has in UTF-16 and UTF-32. As soon as a
+ * concrete decoder has been created, every call is forwarded to it.
+ */
+class GuessingDecoder : public Decoder {
+ public:
+  /**
+   * Picks a decoder from the unread bytes of `in` on the first call(s), then
+   * forwards to that decoder.
+   *
+   * A recognised BOM is consumed by advancing `in_offset` past it before the
+   * chosen decoder is called.
+   *
+   * Returns State::NEED_MORE, without consuming anything, if there is no
+   * unread input, if the input starts with FF FE and fewer than four bytes
+   * are available, or if utf::read8 needs more bytes. Returns State::INVALID
+   * if no pattern matched and the input does not start with valid UTF-8.
+   * Otherwise returns whatever the chosen decoder returns.
+   */
+  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+               std::span<uint8_t> out, std::size_t& out_offset) override {
+    assert(in_offset <= in.size());
+
+    if (!decided_) {
+      auto const avail = in.size() - in_offset;
+      if (avail == 0)
+        return State::NEED_MORE;
+
+      // Byte order marks. The four byte marks are tested first because the
+      // UTF-32LE mark (FF FE 00 00) starts with the UTF-16LE mark (FF FE).
+      if (eq(in, in_offset, "\x00\x00\xfe\xff"sv)) {
+        in_offset += 4;
+        decided_ = create_utf32be_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe\x00\x00"sv)) {
+        in_offset += 4;
+        decided_ = create_utf32le_decoder();
+      } else if (eq(in, in_offset, "\xef\xbb\xbf"sv)) {
+        // Consumed like the other marks so that the chosen decoder starts at
+        // the first real character.
+        in_offset += 3;
+        decided_ = create_utf8_decoder();
+      } else if (eq(in, in_offset, "\xfe\xff"sv)) {
+        in_offset += 2;
+        decided_ = create_utf16be_decoder();
+      } else if (eq(in, in_offset, "\xff\xfe"sv)) {
+        // Could be the first half of the UTF-32LE mark, need more data to
+        // decide (note, an xml document encoded in UTF-16 that is less than
+        // 4 bytes is rather impossible).
+        if (avail < 4)
+          return State::NEED_MORE;
+        in_offset += 2;
+        decided_ = create_utf16le_decoder();
+      } else if (avail >= 4 && in[in_offset] == 0 && in[in_offset + 1] == 0
+                 && in[in_offset + 2] == 0 && in[in_offset + 3] != 0) {
+        // No mark: 00 00 00 xx is one character with the most significant
+        // byte first.
+        decided_ = create_utf32be_decoder();
+      } else if (avail >= 4 && in[in_offset] != 0 && in[in_offset + 1] == 0
+                 && in[in_offset + 2] == 0 && in[in_offset + 3] == 0) {
+        // No mark: xx 00 00 00 is one character with the least significant
+        // byte first.
+        decided_ = create_utf32le_decoder();
+      } else if (avail >= 2 && in[in_offset] == 0 && in[in_offset + 1] != 0) {
+        // No mark: 00 xx is a 16-bit unit with the high byte first.
+        decided_ = create_utf16be_decoder();
+      } else if (avail >= 2 && in[in_offset] != 0 && in[in_offset + 1] == 0) {
+        // No mark: xx 00 is a 16-bit unit with the low byte first.
+        // NOTE(review): with fewer than four unread bytes this (and the UTF-8
+        // fallback below) can be chosen for input that is really UTF-32LE or
+        // UTF-16LE; presumably callers hand over a larger first chunk,
+        // confirm.
+        decided_ = create_utf16le_decoder();
+      } else {
+        // Nothing matched; accept the input as UTF-8 if it starts with a
+        // valid UTF-8 sequence. A scratch offset is used so that nothing is
+        // consumed by the probe.
+        auto probe_offset = in_offset;
+        auto const probe = utf::read8(in, probe_offset);
+        if (probe == utf::NEED_MORE)
+          return State::NEED_MORE;
+        if (probe == utf::INVALID)
+          return State::INVALID;
+        // UTF-8 should be good enough to read the XML declaration.
+        decided_ = create_utf8_decoder();
+      }
+    }
+    return decided_->decode(in, in_offset, out, out_offset);
+  }
+
+ private:
+  // The decoder chosen from the start of the input; null until then.
+  std::unique_ptr<Decoder> decided_;
+};
+
+} // namespace
+
+// Creates a decoder that chooses the actual encoding from the first bytes it
+// is asked to decode and then hands all decoding over to the decoder for
+// that encoding.
+std::unique_ptr<Decoder> create_guessing_decoder() {
+  std::unique_ptr<Decoder> decoder = std::make_unique<GuessingDecoder>();
+  return decoder;
+}
+
+} // namespace sax
+} // namespace modxml