summaryrefslogtreecommitdiff
path: root/sax/inc/sax_decoder.hh
diff options
context:
space:
mode:
authorJoel Klinghed <the_jk@spawned.biz>2023-06-13 10:07:16 +0200
committerJoel Klinghed <the_jk@spawned.biz>2023-06-13 10:07:16 +0200
commitfc4547b412e28164af1bf8981234c6af959ccc0b (patch)
tree061253e7a4f6abaca282223b36d10f0bed8cad23 /sax/inc/sax_decoder.hh
WIP
Diffstat (limited to 'sax/inc/sax_decoder.hh')
-rw-r--r--sax/inc/sax_decoder.hh57
1 files changed, 57 insertions, 0 deletions
diff --git a/sax/inc/sax_decoder.hh b/sax/inc/sax_decoder.hh
new file mode 100644
index 0000000..40a56c9
--- /dev/null
+++ b/sax/inc/sax_decoder.hh
@@ -0,0 +1,57 @@
+#ifndef SAX_DECODER_HH
+#define SAX_DECODER_HH
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace modxml {
+namespace sax {
+
+/**
+ * Decoder returned by DecoderFactory. Used by Processor to turn bytes into
+ * Unicode characters.
+ */
+class Decoder {
+ public:
+ virtual ~Decoder() = default;
+
+ enum class State {
+ GOOD = 0,
+ // too little data was given to advance
+ NEED_MORE,
+ // invalid data was given to advance
+ INVALID,
+ };
+
+ /**
+ * Decode as many code points as possible from in (starting at in_offset) and
+ * write them to out (starting at out_offset) as UTF-8.
+ * All written code points must be valid per Unicode, so inside the
+ * range U+0 to U+10FFFF and not a surrogate code point (U+D800-U+DFFF).
+ * No partial output: only write to out if the whole UTF-8 sequence is
+ * going to fit.
+ * There are always at least 4 bytes available (out.size() - out_offset) when
+ * called.
+ * Advance in_offset past the input consumed.
+ * Advance out_offset past the UTF-8 bytes written. Do NOT write past
+ * out.size(). Do NOT resize out.
+ * If at least one code point is decoded and written to out, return GOOD.
+ * If it is not possible to decode a single code point, in_offset and
+ * out_offset should not be advanced and something other than GOOD returned.
+ * Do not keep any references to any of the parameters after returning; the
+ * next decode() call will continue with the following bytes, but all
+ * parameters may have changed as they are subject to the buffer
+ * implementations of the Processor.
+ */
+ virtual State decode(std::string_view in, std::size_t& in_offset,
+ std::string& out, std::size_t& out_offset) = 0;
+
+ protected:
+ Decoder() = default;
+};
+
+} // namespace sax
+} // namespace modxml
+
+#endif // SAX_DECODER_HH