diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2023-06-13 10:07:16 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2023-06-13 10:07:16 +0200 |
| commit | fc4547b412e28164af1bf8981234c6af959ccc0b (patch) | |
| tree | 061253e7a4f6abaca282223b36d10f0bed8cad23 /sax/src/decoder.cc | |
WIP
Diffstat (limited to 'sax/src/decoder.cc')
| -rw-r--r-- | sax/src/decoder.cc | 321 |
1 files changed, 321 insertions, 0 deletions
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc new file mode 100644 index 0000000..30b1735 --- /dev/null +++ b/sax/src/decoder.cc @@ -0,0 +1,321 @@ +#include "decoder.hh" + +#include "macros.hh" +#include "sax_decoder.hh" +#include "utf16.hh" +#include "utf32.hh" +#include "utf8.hh" +#include "utf_error.hh" + +namespace modxml { +namespace sax { + +namespace { + +class UtfDecoder : public Decoder { + public: + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + if (bom_ == -1) UNLIKELY { + std::size_t tmp = in_offset; + uint32_t ret = read(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + if (ret == 0xfeff) { + // To allow offset to advance and to return, we need to + // read at least one more character completely. + ret = read(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + bom_ = 1; + } else { + bom_ = 0; + } + in_offset = tmp; + out[out_offset++] = ret; + if (out_offset == out_size) + return State::GOOD; + } + + do { + uint32_t ret = read(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + return State::GOOD; + } + + protected: + UtfDecoder() = default; + + virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0; + + private: + int8_t bom_{-1}; +}; + +class Utf8Decoder : public UtfDecoder { + public: + Utf8Decoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read8(data, offset); + } +}; + +class Utf16BeDecoder : public UtfDecoder { + public: + Utf16BeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read16be(data, offset); + } +}; + +class Utf16LeDecoder : public UtfDecoder { + public: + Utf16LeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read16le(data, offset); + } +}; + +class Utf32BeDecoder : public UtfDecoder { + public: + Utf32BeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read32be(data, offset); + } +}; + +class Utf32LeDecoder : public UtfDecoder { + public: + Utf32LeDecoder() = default; + + uint32_t read(std::string_view data, std::size_t& offset) const override { + return utf::read32le(data, offset); + } +}; + +class Utf16Decoder : public Decoder { + public: + Utf16Decoder() = default; + + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + if (endian_ == -1) UNLIKELY { + std::size_t tmp = in_offset; + uint32_t ret = utf::read16be(in, tmp); + int8_t endian; + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + if (ret == 0xfeff) { + endian = 1; // Big endian + } else if (ret == 0xfffe) { + endian = 0; // Little endian + } else { + return State::INVALID; + } + + // To allow offset to advance and to return, we need to + // read at least one more character completely. + ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + + endian_ = endian; + in_offset = tmp; + out[out_offset++] = ret; + if (out_offset == out_size) + return State::GOOD; + } + + if (endian_ == 1) { + do { + uint32_t ret = utf::read16be(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } else { + do { + uint32_t ret = utf::read16le(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } + return State::GOOD; + } + + private: + int8_t endian_{-1}; +}; + +class Utf32Decoder : public Decoder { + public: + Utf32Decoder() = default; + + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + if (endian_ == -1) UNLIKELY { + std::size_t tmp = in_offset; + uint32_t ret = utf::read32be(in, tmp); + int8_t endian; + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + tmp = in_offset; + ret = utf::read32le(in, tmp); + if (ret == 0xfeff) { + endian = 0; // Little endian + } else { + return State::INVALID; + } + } else if (ret == 0xfeff) { + endian = 1; // Big endian + } else { + return State::INVALID; + } + + // To allow offset to advance and to return, we need to + // read the next character completely. + ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp); + if (ret == utf::NEED_MORE) { + return State::NEED_MORE; + } + if (ret == utf::INVALID) { + return State::INVALID; + } + + endian_ = endian; + in_offset = tmp; + out[out_offset++] = ret; + if (out_offset == out_size) + return State::GOOD; + } + + if (endian_ == 1) { + do { + uint32_t ret = utf::read32be(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } else { + do { + uint32_t ret = utf::read32le(in, in_offset); + if (ret == utf::NEED_MORE) { + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + } + if (ret == utf::INVALID) { + return out_offset > out_start ? State::GOOD : State::INVALID; + } + out[out_offset++] = ret; + } while (out_offset < out_size); + } + return State::GOOD; + } + + private: + int8_t endian_{-1}; +}; + +class AsciiDecoder : public Decoder { + public: + AsciiDecoder() = default; + + State decode(std::string_view in, std::size_t& in_offset, + uint32_t* out, std::size_t out_size, + std::size_t& out_offset) override { + std::size_t const out_start = out_offset; + do { + if (in_offset == in.size()) + return out_offset > out_start ? State::GOOD : State::NEED_MORE; + if (in[in_offset] & 0x80) + return out_offset > out_start ? State::GOOD : State::INVALID; + out[out_offset++] = in[in_offset++]; + } while (out_offset < out_size); + return State::GOOD; + } +}; + +} // namespace + +std::unique_ptr<Decoder> create_utf8_decoder() { + return std::make_unique<Utf8Decoder>(); +} + +std::unique_ptr<Decoder> create_utf16be_decoder() { + return std::make_unique<Utf16BeDecoder>(); +} + +std::unique_ptr<Decoder> create_utf16le_decoder() { + return std::make_unique<Utf16LeDecoder>(); +} + +std::unique_ptr<Decoder> create_utf32be_decoder() { + return std::make_unique<Utf32BeDecoder>(); +} + +std::unique_ptr<Decoder> create_utf32le_decoder() { + return std::make_unique<Utf32LeDecoder>(); +} + +std::unique_ptr<Decoder> create_utf16_decoder() { + return std::make_unique<Utf16Decoder>(); +} + +std::unique_ptr<Decoder> create_utf32_decoder() { + return std::make_unique<Utf32Decoder>(); +} + +std::unique_ptr<Decoder> create_ascii_decoder() { + return std::make_unique<AsciiDecoder>(); +} + +} // namespace sax +} // namespace modxml + |
