// Patch metadata this header was extracted from:
//
//   From fc4547b412e28164af1bf8981234c6af959ccc0b Mon Sep 17 00:00:00 2001
//   From: Joel Klinghed
//   Date: Tue, 13 Jun 2023 10:07:16 +0200
//   Subject: WIP
//   ---
//    sax/inc/sax_decoder.hh | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
//    1 file changed, 57 insertions(+)
//    create mode 100644 sax/inc/sax_decoder.hh
//   (limited to 'sax/inc/sax_decoder.hh')
//
//   diff --git a/sax/inc/sax_decoder.hh b/sax/inc/sax_decoder.hh
//   new file mode 100644
//   index 0000000..40a56c9
//   --- /dev/null
//   +++ b/sax/inc/sax_decoder.hh
//   @@ -0,0 +1,57 @@

#ifndef SAX_DECODER_HH
#define SAX_DECODER_HH

#include <cstddef>
#include <string>
#include <string_view>

namespace modxml {
namespace sax {

/**
 * Decoder returned by DecoderFactory. Used by Processor to turn bytes into
 * unicode characters.
 */
class Decoder {
 public:
  virtual ~Decoder() = default;

  /**
   * Outcome of a decode() call.
   */
  enum class State {
    // at least one code point was decoded and written to out
    GOOD = 0,
    // too little data was given to advance
    NEED_MORE,
    // invalid data was given to advance
    INVALID,
  };

  /**
   * Decode as many code points as possible from in (starting at in_offset)
   * and write them to out (starting at out_offset) as UTF-8.
   *
   * All written code points must be valid per Unicode: inside the range
   * U+0 to U+10FFFF and not a surrogate code point (U+D800-U+DFFF).
   *
   * No partial output: only write to out if the whole UTF-8 sequence of a
   * code point is going to fit. There are always at least 4 bytes available
   * (out.size() - out_offset) when called, which is enough for any single
   * UTF-8 encoded code point.
   * Do NOT write past out.size(). Do NOT resize out.
   *
   * Do not keep any references to any of the parameters after returning. The
   * next decode() call will point to the following bytes, but all parameters
   * may have changed as they are subject to the buffer implementations of
   * the Processor.
   *
   * @param in          input bytes to decode.
   * @param in_offset   position in in to start reading at; must be advanced
   *                    past the data consumed.
   * @param out         buffer that receives the UTF-8 output.
   * @param out_offset  position in out to start writing at; must be advanced
   *                    past the bytes written.
   * @return GOOD if at least one code point was decoded and written to out.
   *         If it is not possible to decode a single code point, in_offset
   *         and out_offset must not be advanced and something other than
   *         GOOD is returned.
   */
  virtual State decode(std::string_view in, std::size_t& in_offset,
                       std::string& out, std::size_t& out_offset) = 0;

 protected:
  Decoder() = default;
};

}  // namespace sax
}  // namespace modxml

#endif  // SAX_DECODER_HH

// -- cgit v1.2.3-70-g09d2