#include "decoder.hh"
#include "macros.hh"
#include "sax_decoder.hh"
#include "utf16.hh"
#include "utf32.hh"
#include "utf8.hh"
#include "utf_error.hh"

#include <cstddef>
#include <cstdint>
#include <memory>
#include <span>

namespace modxml {
namespace sax {

namespace {

// NOTE(review): the span element types are not visible in this file; uint8_t
// is assumed because AsciiDecoder masks input elements with 0x80. Confirm
// against the Decoder::decode() and utf::read*/write8 declarations.
using InputBytes = std::span<uint8_t const>;
using OutputBytes = std::span<uint8_t>;

// Decoder for encodings whose byte order is fixed by the subclass.
//
// Subclasses provide read(), which yields one code point (or utf::NEED_MORE /
// utf::INVALID) and advances the offset it is given. A U+FEFF found as the
// very first code point is treated as a byte order mark and dropped; U+FEFF
// anywhere later is passed through like any other character.
class KnownEndianDecoder : public Decoder {
 public:
  // Transcodes code points from `in` (starting at `in_offset`) to UTF-8 in
  // `out` (starting at `out_offset`).
  //
  // Returns State::GOOD once at least one code point has been written; in
  // that case `in_offset` points just past the last code point that was
  // written. Returns State::NEED_MORE or State::INVALID, leaving `in_offset`
  // untouched, when not even the first code point could be read and written.
  State decode(InputBytes in, std::size_t& in_offset,
               OutputBytes out, std::size_t& out_offset) override {
    std::size_t cursor = in_offset;
    uint32_t code_point = read(in, cursor);
    if (code_point == utf::NEED_MORE) return State::NEED_MORE;
    if (code_point == utf::INVALID) return State::INVALID;

    // The BOM decision is kept in a local and only stored once the first
    // character has been written, so a failed call can be retried from the
    // same input position.
    int8_t bom = bom_;
    if (bom == -1) UNLIKELY {
      if (code_point == 0xfeff) {
        // To allow offset to advance and to return, we need to
        // read at least one more character completely.
        code_point = read(in, cursor);
        if (code_point == utf::NEED_MORE) return State::NEED_MORE;
        if (code_point == utf::INVALID) return State::INVALID;
        bom = 1;
      } else {
        bom = 0;
      }
    }

    if (!utf::write8(code_point, out, out_offset)) return State::NEED_MORE;
    bom_ = bom;
    in_offset = cursor;

    // Keep going for as long as both reading and writing succeed. Whatever
    // stops the loop is left unconsumed in the input.
    while (true) {
      code_point = read(in, cursor);
      if (code_point == utf::NEED_MORE || code_point == utf::INVALID)
        return State::GOOD;
      if (!utf::write8(code_point, out, out_offset)) return State::GOOD;
      in_offset = cursor;
    }
  }

 protected:
  KnownEndianDecoder() = default;

  // Reads one code point from `data` at `offset`, advancing `offset`.
  virtual uint32_t read(InputBytes data, std::size_t& offset) const = 0;

 private:
  // -1: first code point not seen yet, 0: input had no BOM, 1: BOM skipped.
  int8_t bom_{-1};
};

// UTF-8 input.
class Utf8Decoder final : public KnownEndianDecoder {
 public:
  Utf8Decoder() = default;

  uint32_t read(InputBytes data, std::size_t& offset) const override {
    return utf::read8(data, offset);
  }
};

// UTF-16 input, big endian.
class Utf16BeDecoder final : public KnownEndianDecoder {
 public:
  Utf16BeDecoder() = default;

  uint32_t read(InputBytes data, std::size_t& offset) const override {
    return utf::read16be(data, offset);
  }
};

// UTF-16 input, little endian.
class Utf16LeDecoder final : public KnownEndianDecoder {
 public:
  Utf16LeDecoder() = default;

  uint32_t read(InputBytes data, std::size_t& offset) const override {
    return utf::read16le(data, offset);
  }
};

// UTF-32 input, big endian.
class Utf32BeDecoder final : public KnownEndianDecoder {
 public:
  Utf32BeDecoder() = default;

  uint32_t read(InputBytes data, std::size_t& offset) const override {
    return utf::read32be(data, offset);
  }
};

// UTF-32 input, little endian.
class Utf32LeDecoder final : public KnownEndianDecoder {
 public:
  Utf32LeDecoder() = default;

  uint32_t read(InputBytes data, std::size_t& offset) const override {
    return utf::read32le(data, offset);
  }
};

// Decoder for encodings whose byte order has to be taken from a mandatory
// byte order mark at the start of the input.
//
// Subclasses provide readle() and readbe(), the little and big endian readers
// for the encoding. Input that does not start with a BOM is rejected.
class UnknownEndianDecoder : public Decoder {
 public:
  // Transcodes code points from `in` (starting at `in_offset`) to UTF-8 in
  // `out` (starting at `out_offset`).
  //
  // Returns State::GOOD once at least one code point has been written; in
  // that case `in_offset` points just past the last code point that was
  // written. Returns State::NEED_MORE or State::INVALID otherwise, and then
  // neither `in_offset` nor the detected byte order is committed, so the
  // call can be repeated from the same position (same rule that
  // KnownEndianDecoder follows for its optional BOM).
  State decode(InputBytes in, std::size_t& in_offset,
               OutputBytes out, std::size_t& out_offset) override {
    std::size_t cursor = in_offset;
    int8_t endian = endian_;

    if (endian == -1) UNLIKELY {
      // The BOM is U+FEFF in whichever byte order the stream uses, so probe
      // with both readers. Comparing the big endian value against 0xfffe is
      // not enough: a little endian UTF-32 BOM (FF FE 00 00) taken as a big
      // endian unit is 0xFFFE0000, not 0xfffe, and 00 00 FF FE is not a
      // little endian BOM at all.
      uint32_t const as_big = readbe(in, cursor);
      if (as_big == utf::NEED_MORE) return State::NEED_MORE;
      if (as_big == 0xfeff) {
        endian = 1;
      } else {
        cursor = in_offset;
        if (readle(in, cursor) != 0xfeff) return State::INVALID;
        endian = 0;
      }
    }

    // Pick the reader once; the call still dispatches virtually.
    auto const reader = endian == 0 ? &UnknownEndianDecoder::readle
                                    : &UnknownEndianDecoder::readbe;

    uint32_t code_point = (this->*reader)(in, cursor);
    if (code_point == utf::NEED_MORE) return State::NEED_MORE;
    if (code_point == utf::INVALID) return State::INVALID;
    if (!utf::write8(code_point, out, out_offset)) return State::NEED_MORE;
    endian_ = endian;
    in_offset = cursor;

    // Keep going for as long as both reading and writing succeed. Whatever
    // stops the loop is left unconsumed in the input.
    while (true) {
      code_point = (this->*reader)(in, cursor);
      if (code_point == utf::NEED_MORE || code_point == utf::INVALID)
        return State::GOOD;
      if (!utf::write8(code_point, out, out_offset)) return State::GOOD;
      in_offset = cursor;
    }
  }

 protected:
  UnknownEndianDecoder() = default;

  // Reads one little endian code point from `data` at `offset`, advancing
  // `offset`.
  virtual uint32_t readle(InputBytes data, std::size_t& offset) const = 0;

  // Reads one big endian code point from `data` at `offset`, advancing
  // `offset`.
  virtual uint32_t readbe(InputBytes data, std::size_t& offset) const = 0;

 private:
  // -1: byte order not detected yet, 0: little endian, 1: big endian.
  int8_t endian_{-1};
};

// UTF-16 input, byte order taken from the BOM.
class Utf16Decoder final : public UnknownEndianDecoder {
 public:
  Utf16Decoder() = default;

  uint32_t readle(InputBytes data, std::size_t& offset) const override {
    return utf::read16le(data, offset);
  }

  uint32_t readbe(InputBytes data, std::size_t& offset) const override {
    return utf::read16be(data, offset);
  }
};

// UTF-32 input, byte order taken from the BOM.
class Utf32Decoder final : public UnknownEndianDecoder {
 public:
  Utf32Decoder() = default;

  uint32_t readle(InputBytes data, std::size_t& offset) const override {
    return utf::read32le(data, offset);
  }

  uint32_t readbe(InputBytes data, std::size_t& offset) const override {
    return utf::read32be(data, offset);
  }
};

// 7-bit ASCII input; any byte with the high bit set is rejected.
class AsciiDecoder final : public Decoder {
 public:
  AsciiDecoder() = default;

  // Copies ASCII bytes from `in` to `out` through utf::write8().
  //
  // Returns State::GOOD once at least one byte has been written, with
  // `in_offset` advanced past every byte that was written. If the first byte
  // cannot be processed, returns State::NEED_MORE (no input left, or the
  // write failed) or State::INVALID (high bit set) and leaves `in_offset`
  // untouched.
  State decode(InputBytes in, std::size_t& in_offset,
               OutputBytes out, std::size_t& out_offset) override {
    if (in_offset >= in.size()) return State::NEED_MORE;
    if (in[in_offset] & 0x80) return State::INVALID;
    if (!utf::write8(in[in_offset], out, out_offset)) return State::NEED_MORE;
    ++in_offset;

    // Whatever stops the loop is left unconsumed in the input.
    while (true) {
      if (in_offset >= in.size() || in[in_offset] & 0x80) return State::GOOD;
      if (!utf::write8(in[in_offset], out, out_offset)) return State::GOOD;
      ++in_offset;
    }
  }
};

}  // namespace

std::unique_ptr<Decoder> create_utf8_decoder() {
  return std::make_unique<Utf8Decoder>();
}

std::unique_ptr<Decoder> create_utf16be_decoder() {
  return std::make_unique<Utf16BeDecoder>();
}

std::unique_ptr<Decoder> create_utf16le_decoder() {
  return std::make_unique<Utf16LeDecoder>();
}

std::unique_ptr<Decoder> create_utf32be_decoder() {
  return std::make_unique<Utf32BeDecoder>();
}

std::unique_ptr<Decoder> create_utf32le_decoder() {
  return std::make_unique<Utf32LeDecoder>();
}

std::unique_ptr<Decoder> create_utf16_decoder() {
  return std::make_unique<Utf16Decoder>();
}

std::unique_ptr<Decoder> create_utf32_decoder() {
  return std::make_unique<Utf32Decoder>();
}

std::unique_ptr<Decoder> create_ascii_decoder() {
  return std::make_unique<AsciiDecoder>();
}

}  // namespace sax
}  // namespace modxml