From 7dd49c6293172b494c78918507242cdb55d35137 Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Sun, 21 Jan 2024 12:31:30 +0100 Subject: WIP --- sax/src/decoder.cc | 308 +++++++++++++++++++++++------------------------------ 1 file changed, 134 insertions(+), 174 deletions(-) (limited to 'sax/src/decoder.cc') diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc index 30b1735..35b9b46 100644 --- a/sax/src/decoder.cc +++ b/sax/src/decoder.cc @@ -12,273 +12,233 @@ namespace sax { namespace { -class UtfDecoder : public Decoder { +class KnownEndianDecoder : public Decoder { public: - State decode(std::string_view in, std::size_t& in_offset, - uint32_t* out, std::size_t out_size, - std::size_t& out_offset) override { - std::size_t const out_start = out_offset; + State decode(std::span in, std::size_t& in_offset, + std::span out, std::size_t& out_offset) override { + std::size_t tmp = in_offset; + uint32_t ret = read(in, tmp); + if (ret == utf::NEED_MORE) + return State::NEED_MORE; + if (ret == utf::INVALID) + return State::INVALID; + if (bom_ == -1) UNLIKELY { - std::size_t tmp = in_offset; - uint32_t ret = read(in, tmp); - if (ret == utf::NEED_MORE) { - return State::NEED_MORE; - } - if (ret == utf::INVALID) { - return State::INVALID; - } if (ret == 0xfeff) { // To allow offset to advance and to return, we need to // read at least one more character completely. ret = read(in, tmp); - if (ret == utf::NEED_MORE) { + if (ret == utf::NEED_MORE) return State::NEED_MORE; - } - if (ret == utf::INVALID) { + if (ret == utf::INVALID) return State::INVALID; - } bom_ = 1; } else { bom_ = 0; } - in_offset = tmp; - out[out_offset++] = ret; - if (out_offset == out_size) - return State::GOOD; + if (!utf::write8(ret, out, out_offset)) { + bom_ = -1; + return State::NEED_MORE; + } + } else { + if (!utf::write8(ret, out, out_offset)) + return State::NEED_MORE; } + in_offset = tmp; - do { - uint32_t ret = read(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); - return State::GOOD; + while (true) { + ret = read(in, tmp); + if (ret == utf::NEED_MORE || ret == utf::INVALID) + return State::GOOD; + if (!utf::write8(ret, out, out_offset)) + return State::GOOD; + in_offset = tmp; + } } protected: - UtfDecoder() = default; + KnownEndianDecoder() = default; - virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0; + virtual uint32_t read( + std::span data, std::size_t& offset) const = 0; private: int8_t bom_{-1}; }; -class Utf8Decoder : public UtfDecoder { +class Utf8Decoder : public KnownEndianDecoder { public: Utf8Decoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span data, std::size_t& offset) const override { return utf::read8(data, offset); } }; -class Utf16BeDecoder : public UtfDecoder { +class Utf16BeDecoder : public KnownEndianDecoder { public: Utf16BeDecoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span data, std::size_t& offset) const override { return utf::read16be(data, offset); } }; -class Utf16LeDecoder : public UtfDecoder { +class Utf16LeDecoder : public KnownEndianDecoder { public: Utf16LeDecoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span data, std::size_t& offset) const override { return utf::read16le(data, offset); } }; -class Utf32BeDecoder : public UtfDecoder { +class Utf32BeDecoder : public KnownEndianDecoder { public: Utf32BeDecoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span data, std::size_t& offset) const override { return utf::read32be(data, offset); } }; -class Utf32LeDecoder : public UtfDecoder { +class Utf32LeDecoder : public KnownEndianDecoder { public: Utf32LeDecoder() = default; - uint32_t read(std::string_view data, std::size_t& offset) const override { + uint32_t read( + std::span data, std::size_t& offset) const override { return utf::read32le(data, offset); } }; -class Utf16Decoder : public Decoder { +class UnknownEndianDecoder : public Decoder { public: - Utf16Decoder() = default; - - State decode(std::string_view in, std::size_t& in_offset, - uint32_t* out, std::size_t out_size, - std::size_t& out_offset) override { - std::size_t const out_start = out_offset; + State decode(std::span in, std::size_t& in_offset, + std::span out, std::size_t& out_offset) override { + std::size_t tmp = in_offset; if (endian_ == -1) UNLIKELY { - std::size_t tmp = in_offset; - uint32_t ret = utf::read16be(in, tmp); - int8_t endian; - if (ret == utf::NEED_MORE) { + uint32_t ret = readbe(in, tmp); + if (ret == utf::NEED_MORE) return State::NEED_MORE; - } - if (ret == utf::INVALID) { + if (ret == utf::INVALID) return State::INVALID; - } if (ret == 0xfeff) { - endian = 1; // Big endian + endian_ = 1; } else if (ret == 0xfffe) { - endian = 0; // Little endian + endian_ = 0; } else { return State::INVALID; } + in_offset = tmp; + } - // To allow offset to advance and to return, we need to - // read at least one more character completely. - ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp); - if (ret == utf::NEED_MORE) { + if (endian_ == 0) { + uint32_t ret = readle(in, tmp); + if (ret == utf::NEED_MORE) return State::NEED_MORE; - } - if (ret == utf::INVALID) { + if (ret == utf::INVALID) return State::INVALID; - } + if (!utf::write8(ret, out, out_offset)) + return State::NEED_MORE; + in_offset = tmp; - endian_ = endian; + while (true) { + ret = readle(in, tmp); + if (ret == utf::NEED_MORE || ret == utf::INVALID) + return State::GOOD; + if (!utf::write8(ret, out, out_offset)) + return State::GOOD; + in_offset = tmp; + } + } else /* if (endian_ == 1) */ { + uint32_t ret = readbe(in, tmp); + if (ret == utf::NEED_MORE) + return State::NEED_MORE; + if (ret == utf::INVALID) + return State::INVALID; + if (!utf::write8(ret, out, out_offset)) + return State::NEED_MORE; in_offset = tmp; - out[out_offset++] = ret; - if (out_offset == out_size) - return State::GOOD; - } - if (endian_ == 1) { - do { - uint32_t ret = utf::read16be(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); - } else { - do { - uint32_t ret = utf::read16le(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); + while (true) { + ret = readbe(in, tmp); + if (ret == utf::NEED_MORE || ret == utf::INVALID) + return State::GOOD; + if (!utf::write8(ret, out, out_offset)) + return State::GOOD; + in_offset = tmp; + } } - return State::GOOD; } + protected: + UnknownEndianDecoder() = default; + + virtual uint32_t readle( + std::span data, std::size_t& offset) const = 0; + virtual uint32_t readbe( + std::span data, std::size_t& offset) const = 0; + private: int8_t endian_{-1}; }; -class Utf32Decoder : public Decoder { +class Utf16Decoder : public UnknownEndianDecoder { public: - Utf32Decoder() = default; + Utf16Decoder() = default; - State decode(std::string_view in, std::size_t& in_offset, - uint32_t* out, std::size_t out_size, - std::size_t& out_offset) override { - std::size_t const out_start = out_offset; - if (endian_ == -1) UNLIKELY { - std::size_t tmp = in_offset; - uint32_t ret = utf::read32be(in, tmp); - int8_t endian; - if (ret == utf::NEED_MORE) { - return State::NEED_MORE; - } - if (ret == utf::INVALID) { - tmp = in_offset; - ret = utf::read32le(in, tmp); - if (ret == 0xfeff) { - endian = 0; // Little endian - } else { - return State::INVALID; - } - } else if (ret == 0xfeff) { - endian = 1; // Big endian - } else { - return State::INVALID; - } + uint32_t readle( + std::span data, std::size_t& offset) const override { + return utf::read16le(data, offset); + } - // To allow offset to advance and to return, we need to - // read the next character completely. - ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp); - if (ret == utf::NEED_MORE) { - return State::NEED_MORE; - } - if (ret == utf::INVALID) { - return State::INVALID; - } + uint32_t readbe( + std::span data, std::size_t& offset) const override { + return utf::read16be(data, offset); + } +}; - endian_ = endian; - in_offset = tmp; - out[out_offset++] = ret; - if (out_offset == out_size) - return State::GOOD; - } +class Utf32Decoder : public UnknownEndianDecoder { + public: + Utf32Decoder() = default; - if (endian_ == 1) { - do { - uint32_t ret = utf::read32be(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); - } else { - do { - uint32_t ret = utf::read32le(in, in_offset); - if (ret == utf::NEED_MORE) { - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - } - if (ret == utf::INVALID) { - return out_offset > out_start ? State::GOOD : State::INVALID; - } - out[out_offset++] = ret; - } while (out_offset < out_size); - } - return State::GOOD; + uint32_t readle( + std::span data, std::size_t& offset) const override { + return utf::read32le(data, offset); } - private: - int8_t endian_{-1}; + uint32_t readbe( + std::span data, std::size_t& offset) const override { + return utf::read32be(data, offset); + } }; class AsciiDecoder : public Decoder { public: AsciiDecoder() = default; - State decode(std::string_view in, std::size_t& in_offset, - uint32_t* out, std::size_t out_size, - std::size_t& out_offset) override { - std::size_t const out_start = out_offset; - do { - if (in_offset == in.size()) - return out_offset > out_start ? State::GOOD : State::NEED_MORE; - if (in[in_offset] & 0x80) - return out_offset > out_start ? State::GOOD : State::INVALID; - out[out_offset++] = in[in_offset++]; - } while (out_offset < out_size); - return State::GOOD; + State decode(std::span in, std::size_t& in_offset, + std::span out, std::size_t& out_offset) override { + if (in_offset >= in.size()) + return State::NEED_MORE; + if (in[in_offset] & 0x80) + return State::INVALID; + if (!utf::write8(in[in_offset], out, out_offset)) + return State::NEED_MORE; + ++in_offset; + + while (true) { + if (in_offset >= in.size() || in[in_offset] & 0x80) + return State::GOOD; + if (!utf::write8(in[in_offset], out, out_offset)) + return State::GOOD; + ++in_offset; + } } }; -- cgit v1.2.3-70-g09d2