blob: 8b2490c7e18a1cd85baeb8bd1ee1d9afc8da58ac (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
#ifndef SAX_DECODER_HH
#define SAX_DECODER_HH
#include <cstddef>
#include <cstdint>
#include <span>
namespace modxml {
namespace sax {
/**
 * Decoder returned by DecoderFactory. Used by Processor to turn bytes into
 * Unicode characters encoded as UTF-8.
 */
class Decoder {
public:
  virtual ~Decoder() = default;

  /** Outcome of a decode() call. */
  enum class State {
    // at least one code point was decoded and written to out
    GOOD = 0,
    // too little data was given to decode
    NEED_MORE,
    // invalid data was given to decode
    INVALID,
  };

  /**
   * Decode as many code points as possible from in (starting at in_offset)
   * and write them to out (starting at out_offset) as UTF-8.
   *
   * Output rules:
   * - Every written code point must be valid per Unicode, i.e. inside the
   *   range U+0000 to U+10FFFF and not a surrogate code point
   *   (U+D800-U+DFFF).
   * - No partial code point output: only write to out if the whole UTF-8
   *   sequence for the code point is going to fit.
   * - There will always be at least 4 bytes available
   *   (out.size() - out_offset) when called, which is enough room for the
   *   longest possible UTF-8 sequence.
   *
   * Offsets:
   * - Advance in_offset for data consumed. Do NOT read past in.size().
   * - Advance out_offset for code points written. Do NOT write past
   *   out.size().
   *
   * Return value:
   * - If at least one code point is decoded and written to out, return GOOD.
   * - If it is not possible to decode a single code point, in_offset and
   *   out_offset must not be advanced and something other than GOOD must be
   *   returned (NEED_MORE or INVALID).
   *
   * Lifetime: do not keep any references to any of the parameters after
   * returning. The next decode() call will point to the following bytes, but
   * all parameters may have changed as they are subject to the buffer
   * implementations of the Processor.
   *
   * @param in          bytes to decode
   * @param in_offset   in/out: index in in of the first byte not yet consumed
   * @param out         buffer receiving the UTF-8 output
   * @param out_offset  in/out: index in out of the first free byte
   * @return GOOD if progress was made, otherwise the reason it was not
   */
  virtual State decode(std::span<std::uint8_t const> in,
                       std::size_t& in_offset,
                       std::span<std::uint8_t> out,
                       std::size_t& out_offset) = 0;

protected:
  // Only constructible through a concrete subclass.
  Decoder() = default;
};
} // namespace sax
} // namespace modxml
#endif // SAX_DECODER_HH
|