#include "decoder.hh"
#include "macros.hh"
#include "sax_decoder.hh"
#include "utf16.hh"
#include "utf32.hh"
#include "utf8.hh"
#include "utf_error.hh"

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string_view>

namespace modxml {
namespace sax {

namespace {

/// Code point of the Unicode byte order mark.
constexpr uint32_t kByteOrderMark = 0xfeff;
/// Value a big-endian 16-bit read produces for a little-endian byte order mark.
constexpr uint32_t kSwappedByteOrderMark = 0xfffe;

/// Internal base that hosts the decode loop shared by the UTF decoders below.
class CodePointDecoder : public Decoder {
 protected:
  CodePointDecoder() = default;

  /// Reads code points with `read_one` and appends them to `out` until the
  /// input runs dry, an invalid sequence is hit, or `out_offset` reaches
  /// `out_size`.
  ///
  /// @param read_one    callable `(std::string_view, std::size_t&)` returning
  ///                    a code point, `utf::NEED_MORE` or `utf::INVALID`.
  /// @param in          input bytes.
  /// @param in_offset   read position in `in`; only `read_one` moves it.
  /// @param out         output buffer of code points.
  /// @param out_size    end index of the writable part of `out`.
  /// @param out_offset  write position in `out`; advanced per code point.
  /// @param out_start   value `out_offset` had when the enclosing decode()
  ///                    call began.
  /// @return GOOD if anything was written since `out_start`; otherwise
  ///         NEED_MORE or INVALID, mirroring what `read_one` reported.
  ///
  /// One code point is written before `out_size` is checked, so callers must
  /// enter with `out_offset < out_size`.
  template <typename Reader>
  static State drain(Reader read_one, std::string_view in,
                     std::size_t& in_offset, uint32_t* out,
                     std::size_t out_size, std::size_t& out_offset,
                     std::size_t out_start) {
    do {
      uint32_t const code_point = read_one(in, in_offset);
      if (code_point == utf::NEED_MORE) {
        // Report progress first; the shortage resurfaces on the next call.
        return out_offset > out_start ? State::GOOD : State::NEED_MORE;
      }
      if (code_point == utf::INVALID) {
        // Same idea: hand back what was decoded, fail on the next call.
        return out_offset > out_start ? State::GOOD : State::INVALID;
      }
      out[out_offset++] = code_point;
    } while (out_offset < out_size);
    return State::GOOD;
  }
};

/// Decoder for an encoding whose byte order is already known. A leading byte
/// order mark, if present, is consumed and not emitted.
class UtfDecoder : public CodePointDecoder {
 public:
  /// Decodes code points from `in` (starting at `in_offset`) into
  /// `out[out_offset, out_size)`.
  ///
  /// @return GOOD when at least one code point was produced by this call,
  ///         NEED_MORE when the input ended before a code point completed,
  ///         INVALID when the next sequence is malformed.
  ///
  /// Requires `out_offset < out_size` on entry (a code point is stored
  /// before the bound is checked).
  State decode(std::string_view in, std::size_t& in_offset,
               uint32_t* out, std::size_t out_size,
               std::size_t& out_offset) override {
    std::size_t const out_start = out_offset;
    // UNLIKELY comes from macros.hh; presumably a branch hint - confirm.
    if (bom_ == -1) UNLIKELY {
      // Work on a copy so in_offset is untouched if we have to bail out.
      std::size_t tmp = in_offset;
      uint32_t ret = read(in, tmp);
      if (ret == utf::NEED_MORE) {
        return State::NEED_MORE;
      }
      if (ret == utf::INVALID) {
        return State::INVALID;
      }
      if (ret == kByteOrderMark) {
        // To allow offset to advance and to return, we need to
        // read at least one more character completely.
        ret = read(in, tmp);
        if (ret == utf::NEED_MORE) {
          return State::NEED_MORE;
        }
        if (ret == utf::INVALID) {
          return State::INVALID;
        }
        bom_ = 1;
      } else {
        bom_ = 0;
      }
      in_offset = tmp;
      out[out_offset++] = ret;
      if (out_offset == out_size) return State::GOOD;
    }

    return drain(
        [this](std::string_view data, std::size_t& offset) {
          return read(data, offset);
        },
        in, in_offset, out, out_size, out_offset, out_start);
  }

 protected:
  UtfDecoder() = default;

  /// Reads one code point from `data` at `offset`. Implementations forward to
  /// a `utf::read*` function, which may return `utf::NEED_MORE` or
  /// `utf::INVALID` instead of a code point.
  virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;

 private:
  /// -1 until the start of the stream has been inspected; afterwards 1 if a
  /// byte order mark was skipped, 0 if not.
  int8_t bom_{-1};
};

/// UTF-8 decoder.
class Utf8Decoder final : public UtfDecoder {
 public:
  Utf8Decoder() = default;

  uint32_t read(std::string_view data, std::size_t& offset) const override {
    return utf::read8(data, offset);
  }
};

/// UTF-16 big-endian decoder.
class Utf16BeDecoder final : public UtfDecoder {
 public:
  Utf16BeDecoder() = default;

  uint32_t read(std::string_view data, std::size_t& offset) const override {
    return utf::read16be(data, offset);
  }
};

/// UTF-16 little-endian decoder.
class Utf16LeDecoder final : public UtfDecoder {
 public:
  Utf16LeDecoder() = default;

  uint32_t read(std::string_view data, std::size_t& offset) const override {
    return utf::read16le(data, offset);
  }
};

/// UTF-32 big-endian decoder.
class Utf32BeDecoder final : public UtfDecoder {
 public:
  Utf32BeDecoder() = default;

  uint32_t read(std::string_view data, std::size_t& offset) const override {
    return utf::read32be(data, offset);
  }
};

/// UTF-32 little-endian decoder.
class Utf32LeDecoder final : public UtfDecoder {
 public:
  Utf32LeDecoder() = default;

  uint32_t read(std::string_view data, std::size_t& offset) const override {
    return utf::read32le(data, offset);
  }
};

/// UTF-16 decoder that learns the byte order from a mandatory leading byte
/// order mark. Input that does not start with one is reported as INVALID.
class Utf16Decoder final : public CodePointDecoder {
 public:
  Utf16Decoder() = default;

  /// Same contract as UtfDecoder::decode; the first successful call also
  /// consumes the byte order mark and fixes the endianness.
  State decode(std::string_view in, std::size_t& in_offset,
               uint32_t* out, std::size_t out_size,
               std::size_t& out_offset) override {
    std::size_t const out_start = out_offset;
    if (endian_ == -1) UNLIKELY {
      // Work on a copy so in_offset is untouched if we have to bail out.
      std::size_t tmp = in_offset;
      uint32_t ret = utf::read16be(in, tmp);
      int8_t endian;
      if (ret == utf::NEED_MORE) {
        return State::NEED_MORE;
      }
      if (ret == utf::INVALID) {
        return State::INVALID;
      }
      if (ret == kByteOrderMark) {
        endian = 1;  // Big endian
      } else if (ret == kSwappedByteOrderMark) {
        endian = 0;  // Little endian
      } else {
        return State::INVALID;
      }
      // To allow offset to advance and to return, we need to
      // read at least one more character completely.
      ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
      if (ret == utf::NEED_MORE) {
        return State::NEED_MORE;
      }
      if (ret == utf::INVALID) {
        return State::INVALID;
      }
      endian_ = endian;
      in_offset = tmp;
      out[out_offset++] = ret;
      if (out_offset == out_size) return State::GOOD;
    }

    if (endian_ == 1) {
      return drain(
          [](std::string_view data, std::size_t& offset) {
            return utf::read16be(data, offset);
          },
          in, in_offset, out, out_size, out_offset, out_start);
    }
    return drain(
        [](std::string_view data, std::size_t& offset) {
          return utf::read16le(data, offset);
        },
        in, in_offset, out, out_size, out_offset, out_start);
  }

 private:
  /// -1 while undetermined, 1 for big endian, 0 for little endian.
  int8_t endian_{-1};
};

/// UTF-32 decoder that learns the byte order from a mandatory leading byte
/// order mark. Input that does not start with one is reported as INVALID.
class Utf32Decoder final : public CodePointDecoder {
 public:
  Utf32Decoder() = default;

  /// Same contract as UtfDecoder::decode; the first successful call also
  /// consumes the byte order mark and fixes the endianness.
  State decode(std::string_view in, std::size_t& in_offset,
               uint32_t* out, std::size_t out_size,
               std::size_t& out_offset) override {
    std::size_t const out_start = out_offset;
    if (endian_ == -1) UNLIKELY {
      // Work on a copy so in_offset is untouched if we have to bail out.
      std::size_t tmp = in_offset;
      uint32_t ret = utf::read32be(in, tmp);
      int8_t endian;
      if (ret == utf::NEED_MORE) {
        return State::NEED_MORE;
      }
      if (ret == utf::INVALID) {
        // The big-endian read rejected the unit; retry the same bytes as
        // little endian and accept only a byte order mark.
        tmp = in_offset;
        ret = utf::read32le(in, tmp);
        if (ret == kByteOrderMark) {
          endian = 0;  // Little endian
        } else {
          return State::INVALID;
        }
      } else if (ret == kByteOrderMark) {
        endian = 1;  // Big endian
      } else {
        return State::INVALID;
      }
      // To allow offset to advance and to return, we need to
      // read the next character completely.
      ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
      if (ret == utf::NEED_MORE) {
        return State::NEED_MORE;
      }
      if (ret == utf::INVALID) {
        return State::INVALID;
      }
      endian_ = endian;
      in_offset = tmp;
      out[out_offset++] = ret;
      if (out_offset == out_size) return State::GOOD;
    }

    if (endian_ == 1) {
      return drain(
          [](std::string_view data, std::size_t& offset) {
            return utf::read32be(data, offset);
          },
          in, in_offset, out, out_size, out_offset, out_start);
    }
    return drain(
        [](std::string_view data, std::size_t& offset) {
          return utf::read32le(data, offset);
        },
        in, in_offset, out, out_size, out_offset, out_start);
  }

 private:
  /// -1 while undetermined, 1 for big endian, 0 for little endian.
  int8_t endian_{-1};
};

/// 7-bit ASCII decoder: every byte with the high bit clear maps to the code
/// point of the same value; a byte with the high bit set is invalid.
class AsciiDecoder final : public Decoder {
 public:
  AsciiDecoder() = default;

  /// Copies bytes from `in` to `out` one to one.
  ///
  /// @return GOOD when at least one byte was converted by this call,
  ///         NEED_MORE when `in` was already exhausted, INVALID when the
  ///         next byte has bit 0x80 set.
  ///
  /// Requires `out_offset < out_size` on entry (a value is stored before the
  /// bound is checked).
  State decode(std::string_view in, std::size_t& in_offset,
               uint32_t* out, std::size_t out_size,
               std::size_t& out_offset) override {
    std::size_t const out_start = out_offset;
    do {
      if (in_offset == in.size()) {
        return out_offset > out_start ? State::GOOD : State::NEED_MORE;
      }
      if (in[in_offset] & 0x80) {
        return out_offset > out_start ? State::GOOD : State::INVALID;
      }
      // High bit is clear here, so the char value is non-negative.
      out[out_offset++] = in[in_offset++];
    } while (out_offset < out_size);
    return State::GOOD;
  }
};

}  // namespace

// NOTE(review): the template arguments below were missing; they are restored
// as std::unique_ptr<Decoder> / std::make_unique<XDecoder>() to match the
// class each factory is named after - confirm against sax_decoder.hh.

/// Creates a UTF-8 decoder.
std::unique_ptr<Decoder> create_utf8_decoder() {
  return std::make_unique<Utf8Decoder>();
}

/// Creates a UTF-16 big-endian decoder.
std::unique_ptr<Decoder> create_utf16be_decoder() {
  return std::make_unique<Utf16BeDecoder>();
}

/// Creates a UTF-16 little-endian decoder.
std::unique_ptr<Decoder> create_utf16le_decoder() {
  return std::make_unique<Utf16LeDecoder>();
}

/// Creates a UTF-32 big-endian decoder.
std::unique_ptr<Decoder> create_utf32be_decoder() {
  return std::make_unique<Utf32BeDecoder>();
}

/// Creates a UTF-32 little-endian decoder.
std::unique_ptr<Decoder> create_utf32le_decoder() {
  return std::make_unique<Utf32LeDecoder>();
}

/// Creates a UTF-16 decoder that detects byte order from the byte order mark.
std::unique_ptr<Decoder> create_utf16_decoder() {
  return std::make_unique<Utf16Decoder>();
}

/// Creates a UTF-32 decoder that detects byte order from the byte order mark.
std::unique_ptr<Decoder> create_utf32_decoder() {
  return std::make_unique<Utf32Decoder>();
}

/// Creates a 7-bit ASCII decoder.
std::unique_ptr<Decoder> create_ascii_decoder() {
  return std::make_unique<AsciiDecoder>();
}

}  // namespace sax
}  // namespace modxml