summaryrefslogtreecommitdiff
path: root/sax/src/guessing_decoder.cc
blob: 0e3b628263c82a7eafb96d89b0c73e84c42e3d96 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#include "guessing_decoder.hh"

#include "decoder.hh"
#include "sax_decoder.hh"
#include "utf8.hh"
#include "utf_error.hh"

#include <cassert>

using namespace std::string_view_literals;

namespace modxml {
namespace sax {

namespace {

// Returns true when the bytes of `a` starting at index `a_offset` begin with
// the bytes of `b`.
//
// `a_offset` is taken by reference only to match how callers pass it; it is
// never modified here. An offset past the end of `a`, or fewer than `b.size()`
// bytes remaining, yields false.
bool eq(std::span<uint8_t const> a, std::size_t& a_offset, std::string_view b) {
  // Check the offset first: `a.size() - a_offset` is unsigned and would wrap
  // around to a huge value for an out-of-range offset, letting the loop below
  // read past the end of `a`.
  if (a_offset > a.size() || a.size() - a_offset < b.size())
    return false;
  for (std::size_t i = 0; i < b.size(); ++i)
    // `char` may be signed; compare as unsigned bytes.
    if (a[a_offset + i] != static_cast<uint8_t>(b[i]))
      return false;
  return true;
}

class GuessingDecoder : public Decoder {
 public:
  State decode(std::span<uint8_t const> in, std::size_t& in_offset,
               std::span<uint8_t> out, std::size_t& out_offset) override {
    assert(in_offset <= in.size());

    if (!decided_) {
      if (eq(in, in_offset, "\xef\xbb\xbf"sv)) {
        decided_ = create_utf8_decoder();
      } else if (eq(in, in_offset, "\xff\xfe\x00\x00"sv)) {
        in_offset += 4;
        decided_ = create_utf32le_decoder();
      } else if (eq(in, in_offset, "\xff\xfe"sv)) {
        // Could be UTF-32 BOM, need more data to decide
        // (note, an xml document encoded in UTF-16 that is less than 4 bytes
        //  is rather impossible).
        if (in.size() - in_offset < 4)
          return State::NEED_MORE;
        in_offset += 2;
        decided_ = create_utf16le_decoder();
      } else if (eq(in, in_offset, "\xfe\xff"sv)) {
        in_offset += 2;
        decided_ = create_utf16be_decoder();
      } else if (eq(in, in_offset, "\x00\x00\xfe\xff"sv)) {
        in_offset += 4;
        decided_ = create_utf32be_decoder();
      } else {
        auto avail = in.size() - in_offset;
        if (avail == 0)
          return State::NEED_MORE;
        if (avail >= 4 && in[in_offset] == 0 && in[in_offset + 1] == 0
            && in[in_offset + 2] == 0 && in[in_offset + 3] != 0) {
          decided_ = create_utf32be_decoder();
        } else if (avail >= 4 && in[in_offset] != 0 && in[in_offset + 1] == 0
                   && in[in_offset + 2] == 0 && in[in_offset + 3] == 0) {
          decided_ = create_utf32le_decoder();
        } else if (avail >= 2 && in[in_offset] == 0 && in[in_offset + 1] != 0) {
          decided_ = create_utf16be_decoder();
        } else if (avail >= 2 && in[in_offset] != 0 && in[in_offset + 1] == 0) {
          decided_ = create_utf16le_decoder();
        } else {
          auto tmp = in_offset;
          auto ret = utf::read8(in, tmp);
          if (ret == utf::NEED_MORE)
            return State::NEED_MORE;
          if (ret == utf::INVALID)
            return State::INVALID;
          // UTF-8 should be good enough to read the XML declaration.
          decided_ = create_utf8_decoder();
        }
      }
    }
    return decided_->decode(in, in_offset, out, out_offset);
  }

 private:
  std::unique_ptr<Decoder> decided_;
};

}  // namespace

// Creates a new GuessingDecoder and hands it out through the Decoder
// interface; the caller owns the returned object.
std::unique_ptr<Decoder> create_guessing_decoder() {
  std::unique_ptr<Decoder> decoder = std::make_unique<GuessingDecoder>();
  return decoder;
}

}  // namespace sax
}  // namespace modxml