blob: 40a56c9eba35c5b5ea867a8e740a590c09268ead (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
#ifndef SAX_DECODER_HH
#define SAX_DECODER_HH
#include <memory>
#include <string>
#include <string_view>
namespace modxml {
namespace sax {
/**
 * Interface for turning raw bytes into Unicode characters.
 *
 * Instances are handed out by DecoderFactory and driven by Processor.
 */
class Decoder {
 public:
  /** Outcome of a single decode() call. */
  enum class State {
    /** At least one code point was decoded and written to the output. */
    GOOD = 0,
    /** Too little data was given to advance. */
    NEED_MORE,
    /** Invalid data was given to advance. */
    INVALID,
  };

  virtual ~Decoder() = default;

  /**
   * Decode as many code points as possible and emit them as UTF-8.
   *
   * Input is read from `in` beginning at `in_offset`; output is written into
   * `out` beginning at `out_offset`.
   *
   * Rules every implementation must follow:
   *  - Only write code points that are valid per Unicode: inside the range
   *    U+0 to U+10FFFF and not a surrogate (U+D800-U+DFFF).
   *  - No partial output: write a UTF-8 sequence only if the whole sequence
   *    is going to fit.
   *  - Do NOT write past out.size() and do NOT resize `out`. There are always
   *    at least 4 bytes available (out.size() - out_offset) when called.
   *  - Do not keep references to any of the parameters after returning. The
   *    next call is handed the bytes that follow, but all parameters may have
   *    changed, as they are subject to the buffer implementations of the
   *    Processor.
   *
   * @param in          bytes to decode.
   * @param in_offset   position in `in` to start reading from; advanced for
   *                    the data consumed.
   * @param out         destination buffer for the UTF-8 output.
   * @param out_offset  position in `out` to start writing at; advanced for
   *                    the code points written.
   * @return GOOD if at least one code point was decoded and written to `out`.
   *         If not even a single code point could be decoded, something other
   *         than GOOD is returned and `in_offset` / `out_offset` are left
   *         unadvanced.
   */
  virtual State decode(std::string_view in, std::size_t& in_offset,
                       std::string& out, std::size_t& out_offset) = 0;

 protected:
  // Only concrete decoders (subclasses) may construct the base.
  Decoder() = default;
};
} // namespace sax
} // namespace modxml
#endif // SAX_DECODER_HH
|