1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
#include "guessing_decoder.hh"
#include "decoder.hh"
#include "sax_decoder.hh"
#include "utf8.hh"
#include "utf_error.hh"
#include <cassert>
using namespace std::string_view_literals;
namespace modxml {
namespace sax {
namespace {
bool eq(std::span<uint8_t const> a, std::size_t& a_offset, std::string_view b) {
if (a.size() - a_offset < b.size())
return false;
for (size_t i = 0; i < b.size(); ++i)
if (a[a_offset + i] != static_cast<uint8_t>(b[i]))
return false;
return true;
}
class GuessingDecoder : public Decoder {
public:
State decode(std::span<uint8_t const> in, std::size_t& in_offset,
std::span<uint8_t> out, std::size_t& out_offset) override {
assert(in_offset <= in.size());
if (!decided_) {
if (eq(in, in_offset, "\xef\xbb\xbf"sv)) {
decided_ = create_utf8_decoder();
} else if (eq(in, in_offset, "\xff\xfe\x00\x00"sv)) {
in_offset += 4;
decided_ = create_utf32le_decoder();
} else if (eq(in, in_offset, "\xff\xfe"sv)) {
// Could be UTF-32 BOM, need more data to decide
// (note, an xml document encoded in UTF-16 that is less than 4 bytes
// is rather impossible).
if (in.size() - in_offset < 4)
return State::NEED_MORE;
in_offset += 2;
decided_ = create_utf16le_decoder();
} else if (eq(in, in_offset, "\xfe\xff"sv)) {
in_offset += 2;
decided_ = create_utf16be_decoder();
} else if (eq(in, in_offset, "\x00\x00\xfe\xff"sv)) {
in_offset += 4;
decided_ = create_utf32be_decoder();
} else {
auto avail = in.size() - in_offset;
if (avail == 0)
return State::NEED_MORE;
if (avail >= 4 && in[in_offset] == 0 && in[in_offset + 1] == 0
&& in[in_offset + 2] == 0 && in[in_offset + 3] != 0) {
decided_ = create_utf32be_decoder();
} else if (avail >= 4 && in[in_offset] != 0 && in[in_offset + 1] == 0
&& in[in_offset + 2] == 0 && in[in_offset + 3] == 0) {
decided_ = create_utf32le_decoder();
} else if (avail >= 2 && in[in_offset] == 0 && in[in_offset + 1] != 0) {
decided_ = create_utf16be_decoder();
} else if (avail >= 2 && in[in_offset] != 0 && in[in_offset + 1] == 0) {
decided_ = create_utf16le_decoder();
} else {
auto tmp = in_offset;
auto ret = utf::read8(in, tmp);
if (ret == utf::NEED_MORE)
return State::NEED_MORE;
if (ret == utf::INVALID)
return State::INVALID;
// UTF-8 should be good enough to read the XML declaration.
decided_ = create_utf8_decoder();
}
}
}
return decided_->decode(in, in_offset, out, out_offset);
}
private:
std::unique_ptr<Decoder> decided_;
};
} // namespace
// Factory for the encoding-guessing decoder: allocates a new GuessingDecoder
// and hands it to the caller through the Decoder interface.
std::unique_ptr<Decoder> create_guessing_decoder() {
  auto guesser = std::make_unique<GuessingDecoder>();
  return guesser;
}
} // namespace sax
} // namespace modxml
|