summaryrefslogtreecommitdiff
path: root/sax/src/decoder.cc
diff options
context:
space:
mode:
authorJoel Klinghed <the_jk@spawned.biz>2023-06-13 10:07:16 +0200
committerJoel Klinghed <the_jk@spawned.biz>2023-06-13 10:07:16 +0200
commitfc4547b412e28164af1bf8981234c6af959ccc0b (patch)
tree061253e7a4f6abaca282223b36d10f0bed8cad23 /sax/src/decoder.cc
WIP
Diffstat (limited to 'sax/src/decoder.cc')
-rw-r--r--sax/src/decoder.cc321
1 files changed, 321 insertions, 0 deletions
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc
new file mode 100644
index 0000000..30b1735
--- /dev/null
+++ b/sax/src/decoder.cc
@@ -0,0 +1,321 @@
+#include "decoder.hh"
+
+#include "macros.hh"
+#include "sax_decoder.hh"
+#include "utf16.hh"
+#include "utf32.hh"
+#include "utf8.hh"
+#include "utf_error.hh"
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+/// Shared implementation for decoders of one fixed UTF encoding.
+///
+/// Subclasses provide read() to decode a single code point; this class
+/// handles an optional leading byte order mark (U+FEFF) and fills the
+/// caller's output buffer.
+class UtfDecoder : public Decoder {
+ public:
+  /// Decodes code points from `in` (starting at `in_offset`) into
+  /// `out[out_offset, out_size)`.
+  ///
+  /// `out_offset` is advanced for every code point written; `in_offset` is
+  /// advanced by the read() calls that produced them.
+  ///
+  /// Returns State::GOOD when at least one code point was written during
+  /// this call (or when `out` had no free slot on entry), otherwise
+  /// State::NEED_MORE / State::INVALID as reported by read().
+  State decode(std::string_view in, std::size_t& in_offset,
+               uint32_t* out, std::size_t out_size,
+               std::size_t& out_offset) override {
+    // Without a free slot any write below would overrun `out`.
+    // NOTE(review): GOOD is returned here because GOOD is also what a call
+    // returns when it fills the buffer -- confirm callers treat it that way.
+    if (out_offset >= out_size) {
+      return State::GOOD;
+    }
+
+    std::size_t const out_start = out_offset;
+    if (bom_ == -1) UNLIKELY {
+      // First call: look for a BOM using a scratch offset so that nothing is
+      // consumed unless a complete code point can be handed to the caller.
+      std::size_t tmp = in_offset;
+      uint32_t ret = read(in, tmp);
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return State::INVALID;
+      }
+      if (ret == 0xfeff) {
+        // To allow offset to advance and to return, we need to
+        // read at least one more character completely.
+        ret = read(in, tmp);
+        if (ret == utf::NEED_MORE) {
+          return State::NEED_MORE;
+        }
+        if (ret == utf::INVALID) {
+          return State::INVALID;
+        }
+        bom_ = 1;
+      } else {
+        bom_ = 0;
+      }
+      in_offset = tmp;
+      out[out_offset++] = ret;
+      if (out_offset == out_size) {
+        return State::GOOD;
+      }
+    }
+
+    // A free slot is guaranteed here, so this is equivalent to the
+    // original do/while loop.
+    while (out_offset < out_size) {
+      uint32_t const ret = read(in, in_offset);
+      if (ret == utf::NEED_MORE) {
+        // Report progress first; the shortage shows up on the next call.
+        return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return out_offset > out_start ? State::GOOD : State::INVALID;
+      }
+      out[out_offset++] = ret;
+    }
+    return State::GOOD;
+  }
+
+ protected:
+  UtfDecoder() = default;
+
+  /// Decodes one code point from `data` at `offset`, or returns
+  /// utf::NEED_MORE / utf::INVALID.
+  virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;
+
+ private:
+  /// -1 until the start of input has been examined, then 1 if a BOM was
+  /// found and skipped, 0 if not.
+  int8_t bom_{-1};
+};
+
+/// UtfDecoder specialisation whose code points are read with utf::read8.
+class Utf8Decoder : public UtfDecoder {
+ public:
+  Utf8Decoder() = default;
+
+  /// Reads one code point from `input` at `pos` by delegating to utf::read8.
+  uint32_t read(std::string_view input, std::size_t& pos) const override {
+    uint32_t const code_point = utf::read8(input, pos);
+    return code_point;
+  }
+};
+
+/// UtfDecoder specialisation whose code points are read with utf::read16be.
+class Utf16BeDecoder : public UtfDecoder {
+ public:
+  Utf16BeDecoder() = default;
+
+  /// Reads one code point from `input` at `pos` by delegating to
+  /// utf::read16be.
+  uint32_t read(std::string_view input, std::size_t& pos) const override {
+    uint32_t const code_point = utf::read16be(input, pos);
+    return code_point;
+  }
+};
+
+/// UtfDecoder specialisation whose code points are read with utf::read16le.
+class Utf16LeDecoder : public UtfDecoder {
+ public:
+  Utf16LeDecoder() = default;
+
+  /// Reads one code point from `input` at `pos` by delegating to
+  /// utf::read16le.
+  uint32_t read(std::string_view input, std::size_t& pos) const override {
+    uint32_t const code_point = utf::read16le(input, pos);
+    return code_point;
+  }
+};
+
+/// UtfDecoder specialisation whose code points are read with utf::read32be.
+class Utf32BeDecoder : public UtfDecoder {
+ public:
+  Utf32BeDecoder() = default;
+
+  /// Reads one code point from `input` at `pos` by delegating to
+  /// utf::read32be.
+  uint32_t read(std::string_view input, std::size_t& pos) const override {
+    uint32_t const code_point = utf::read32be(input, pos);
+    return code_point;
+  }
+};
+
+/// UtfDecoder specialisation whose code points are read with utf::read32le.
+class Utf32LeDecoder : public UtfDecoder {
+ public:
+  Utf32LeDecoder() = default;
+
+  /// Reads one code point from `input` at `pos` by delegating to
+  /// utf::read32le.
+  uint32_t read(std::string_view input, std::size_t& pos) const override {
+    uint32_t const code_point = utf::read32le(input, pos);
+    return code_point;
+  }
+};
+
+/// UTF-16 decoder that determines byte order from a mandatory leading byte
+/// order mark: input that does not start with one is rejected as INVALID.
+class Utf16Decoder : public Decoder {
+ public:
+  Utf16Decoder() = default;
+
+  /// Decodes code points from `in` (starting at `in_offset`) into
+  /// `out[out_offset, out_size)`.
+  ///
+  /// Returns State::GOOD when at least one code point was written during
+  /// this call (or when `out` had no free slot on entry), otherwise
+  /// State::NEED_MORE / State::INVALID as reported by the utf:: readers.
+  State decode(std::string_view in, std::size_t& in_offset,
+               uint32_t* out, std::size_t out_size,
+               std::size_t& out_offset) override {
+    // Without a free slot any write below would overrun `out`.
+    // NOTE(review): GOOD is returned here because GOOD is also what a call
+    // returns when it fills the buffer -- confirm callers treat it that way.
+    if (out_offset >= out_size) {
+      return State::GOOD;
+    }
+
+    std::size_t const out_start = out_offset;
+    if (endian_ == -1) UNLIKELY {
+      // Probe with a scratch offset so nothing is consumed until both the
+      // BOM and one complete following code point are available.
+      std::size_t tmp = in_offset;
+      uint32_t ret = utf::read16be(in, tmp);
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return State::INVALID;
+      }
+
+      int8_t endian;
+      if (ret == 0xfeff) {
+        endian = 1;  // Big endian
+      } else if (ret == 0xfffe) {
+        endian = 0;  // Little endian: the BOM appears byte-swapped.
+      } else {
+        return State::INVALID;
+      }
+
+      // To allow offset to advance and to return, we need to
+      // read at least one more character completely.
+      ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return State::INVALID;
+      }
+
+      endian_ = endian;
+      in_offset = tmp;
+      out[out_offset++] = ret;
+      if (out_offset == out_size) {
+        return State::GOOD;
+      }
+    }
+
+    if (endian_ == 1) {
+      return fill(
+          [](std::string_view data, std::size_t& offset) {
+            return utf::read16be(data, offset);
+          },
+          in, in_offset, out, out_size, out_offset, out_start);
+    }
+    return fill(
+        [](std::string_view data, std::size_t& offset) {
+          return utf::read16le(data, offset);
+        },
+        in, in_offset, out, out_size, out_offset, out_start);
+  }
+
+ private:
+  /// Writes code points produced by `read` into `out` until the buffer is
+  /// full or `read` reports utf::NEED_MORE / utf::INVALID. `out_start` is
+  /// the output offset at the start of decode(), used to detect progress.
+  template <typename Read>
+  static State fill(Read read, std::string_view in, std::size_t& in_offset,
+                    uint32_t* out, std::size_t out_size,
+                    std::size_t& out_offset, std::size_t out_start) {
+    while (out_offset < out_size) {
+      uint32_t const ret = read(in, in_offset);
+      if (ret == utf::NEED_MORE) {
+        // Report progress first; the shortage shows up on the next call.
+        return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return out_offset > out_start ? State::GOOD : State::INVALID;
+      }
+      out[out_offset++] = ret;
+    }
+    return State::GOOD;
+  }
+
+  /// -1 until the BOM has been read, then 1 for big endian, 0 for little
+  /// endian.
+  int8_t endian_{-1};
+};
+
+/// UTF-32 decoder that determines byte order from a mandatory leading byte
+/// order mark: input that does not start with one is rejected as INVALID.
+class Utf32Decoder : public Decoder {
+ public:
+  Utf32Decoder() = default;
+
+  /// Decodes code points from `in` (starting at `in_offset`) into
+  /// `out[out_offset, out_size)`.
+  ///
+  /// Returns State::GOOD when at least one code point was written during
+  /// this call (or when `out` had no free slot on entry), otherwise
+  /// State::NEED_MORE / State::INVALID as reported by the utf:: readers.
+  State decode(std::string_view in, std::size_t& in_offset,
+               uint32_t* out, std::size_t out_size,
+               std::size_t& out_offset) override {
+    // Without a free slot any write below would overrun `out`.
+    // NOTE(review): GOOD is returned here because GOOD is also what a call
+    // returns when it fills the buffer -- confirm callers treat it that way.
+    if (out_offset >= out_size) {
+      return State::GOOD;
+    }
+
+    std::size_t const out_start = out_offset;
+    if (endian_ == -1) UNLIKELY {
+      // Probe with a scratch offset so nothing is consumed until both the
+      // BOM and one complete following code point are available.
+      std::size_t tmp = in_offset;
+      uint32_t ret = utf::read32be(in, tmp);
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+
+      int8_t endian;
+      if (ret == utf::INVALID) {
+        // Retry the same bytes as little endian. Presumably a little-endian
+        // BOM read big-endian lies outside the valid code point range and
+        // is therefore reported as INVALID -- verify against utf32.hh.
+        tmp = in_offset;
+        ret = utf::read32le(in, tmp);
+        if (ret == 0xfeff) {
+          endian = 0;  // Little endian
+        } else {
+          return State::INVALID;
+        }
+      } else if (ret == 0xfeff) {
+        endian = 1;  // Big endian
+      } else {
+        return State::INVALID;
+      }
+
+      // To allow offset to advance and to return, we need to
+      // read the next character completely.
+      ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
+      if (ret == utf::NEED_MORE) {
+        return State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return State::INVALID;
+      }
+
+      endian_ = endian;
+      in_offset = tmp;
+      out[out_offset++] = ret;
+      if (out_offset == out_size) {
+        return State::GOOD;
+      }
+    }
+
+    if (endian_ == 1) {
+      return fill(
+          [](std::string_view data, std::size_t& offset) {
+            return utf::read32be(data, offset);
+          },
+          in, in_offset, out, out_size, out_offset, out_start);
+    }
+    return fill(
+        [](std::string_view data, std::size_t& offset) {
+          return utf::read32le(data, offset);
+        },
+        in, in_offset, out, out_size, out_offset, out_start);
+  }
+
+ private:
+  /// Writes code points produced by `read` into `out` until the buffer is
+  /// full or `read` reports utf::NEED_MORE / utf::INVALID. `out_start` is
+  /// the output offset at the start of decode(), used to detect progress.
+  template <typename Read>
+  static State fill(Read read, std::string_view in, std::size_t& in_offset,
+                    uint32_t* out, std::size_t out_size,
+                    std::size_t& out_offset, std::size_t out_start) {
+    while (out_offset < out_size) {
+      uint32_t const ret = read(in, in_offset);
+      if (ret == utf::NEED_MORE) {
+        // Report progress first; the shortage shows up on the next call.
+        return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+      }
+      if (ret == utf::INVALID) {
+        return out_offset > out_start ? State::GOOD : State::INVALID;
+      }
+      out[out_offset++] = ret;
+    }
+    return State::GOOD;
+  }
+
+  /// -1 until the BOM has been read, then 1 for big endian, 0 for little
+  /// endian.
+  int8_t endian_{-1};
+};
+
+/// Decoder for 7-bit ASCII: every input byte below 0x80 becomes one code
+/// point, and any byte with the high bit set is rejected.
+class AsciiDecoder : public Decoder {
+ public:
+  AsciiDecoder() = default;
+
+  /// Copies bytes from `in` (starting at `in_offset`) into
+  /// `out[out_offset, out_size)`, advancing both offsets.
+  ///
+  /// Returns State::GOOD when at least one byte was copied during this call
+  /// (or when `out` had no free slot on entry), State::NEED_MORE when the
+  /// input is exhausted before anything was copied, and State::INVALID when
+  /// the first byte examined has its high bit set.
+  State decode(std::string_view in, std::size_t& in_offset,
+               uint32_t* out, std::size_t out_size,
+               std::size_t& out_offset) override {
+    std::size_t const out_start = out_offset;
+    // Checking capacity before each write (rather than after, as a do/while
+    // would) keeps a full buffer on entry from being overrun.
+    while (out_offset < out_size) {
+      // `>=` so that an offset already past the end cannot index `in`.
+      if (in_offset >= in.size()) {
+        return out_offset > out_start ? State::GOOD : State::NEED_MORE;
+      }
+      // Read as unsigned so the value does not depend on whether plain
+      // `char` is signed.
+      auto const byte = static_cast<unsigned char>(in[in_offset]);
+      if (byte & 0x80) {
+        return out_offset > out_start ? State::GOOD : State::INVALID;
+      }
+      out[out_offset++] = byte;
+      ++in_offset;
+    }
+    return State::GOOD;
+  }
+};
+
+} // namespace
+
+/// Returns a newly allocated Utf8Decoder behind the Decoder interface.
+std::unique_ptr<Decoder> create_utf8_decoder() {
+  std::unique_ptr<Decoder> decoder(std::make_unique<Utf8Decoder>());
+  return decoder;
+}
+
+/// Returns a newly allocated Utf16BeDecoder behind the Decoder interface.
+std::unique_ptr<Decoder> create_utf16be_decoder() {
+  std::unique_ptr<Decoder> decoder(std::make_unique<Utf16BeDecoder>());
+  return decoder;
+}
+
+/// Returns a newly allocated Utf16LeDecoder behind the Decoder interface.
+std::unique_ptr<Decoder> create_utf16le_decoder() {
+  std::unique_ptr<Decoder> decoder(std::make_unique<Utf16LeDecoder>());
+  return decoder;
+}
+
+/// Returns a newly allocated Utf32BeDecoder behind the Decoder interface.
+std::unique_ptr<Decoder> create_utf32be_decoder() {
+  std::unique_ptr<Decoder> decoder(std::make_unique<Utf32BeDecoder>());
+  return decoder;
+}
+
+/// Returns a newly allocated Utf32LeDecoder behind the Decoder interface.
+std::unique_ptr<Decoder> create_utf32le_decoder() {
+  std::unique_ptr<Decoder> decoder(std::make_unique<Utf32LeDecoder>());
+  return decoder;
+}
+
+/// Returns a newly allocated Utf16Decoder (byte order taken from the BOM)
+/// behind the Decoder interface.
+std::unique_ptr<Decoder> create_utf16_decoder() {
+  std::unique_ptr<Decoder> decoder(std::make_unique<Utf16Decoder>());
+  return decoder;
+}
+
+/// Returns a newly allocated Utf32Decoder (byte order taken from the BOM)
+/// behind the Decoder interface.
+std::unique_ptr<Decoder> create_utf32_decoder() {
+  std::unique_ptr<Decoder> decoder(std::make_unique<Utf32Decoder>());
+  return decoder;
+}
+
+/// Returns a newly allocated AsciiDecoder behind the Decoder interface.
+std::unique_ptr<Decoder> create_ascii_decoder() {
+  std::unique_ptr<Decoder> decoder(std::make_unique<AsciiDecoder>());
+  return decoder;
+}
+
+} // namespace sax
+} // namespace modxml
+