#include "uio.hh"

#include "buffer.hh"
#include "u16.hh"
#include "u8.hh"

#include <algorithm>
#include <bit>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <expected>
#include <memory>
#include <optional>
#include <string>
#include <utility>

namespace {

// UTF-16 input format whose byte order matches the host.
constexpr u::ReaderInputFormat kU16NativeInputFormat =
    (std::endian::native == std::endian::big)
        ? u::ReaderInputFormat::UTF16_BE
        : u::ReaderInputFormat::UTF16_LE;

// UTF-16 input format whose byte order is the opposite of the host.
constexpr u::ReaderInputFormat kU16SwapInputFormat =
    (std::endian::native == std::endian::big)
        ? u::ReaderInputFormat::UTF16_LE
        : u::ReaderInputFormat::UTF16_BE;

// Capacity (bytes) of the raw input buffer owned by UnicodeReader.
constexpr size_t kByteBufferSize = 65535;
// Capacity (code points) of the intermediate decoded buffer.
constexpr size_t kUnicodeBufferSize = 8192;
// Capacity (UTF-16 code units) of the scratch buffer used for byte-swapping.
constexpr size_t kUSwapBufferSize = kByteBufferSize / 4;

/**
 * Reader adapter that transcodes the bytes of an underlying io::Reader.
 *
 * UReader decodes raw input bytes into 32-bit code points, UWriter encodes
 * code points into the caller's destination buffer. Both are functors that
 * are default constructed as members.
 */
template <typename UReader, typename UWriter>
class UnicodeReader : public io::Reader {
 public:
  UnicodeReader(std::unique_ptr<io::Reader> in, u::ReaderConfig config)
      : in_(std::move(in)), config_(config), skip_bom_(config_.skip_bom) {}

  /**
   * Decode pending input and encode as much as fits into dst.
   *
   * @param dst destination buffer
   * @param max size of dst in bytes
   * @return number of bytes written to dst, or an io::ReadError
   */
  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
    auto err = fill();
    if (err.has_value())
      return std::unexpected(err.value());

    auto* in = reinterpret_cast<uint8_t const*>(in_ptr_);
    auto read_err = reader_(in, in + in_avail_, in_eof_, u_buffer_wptr_,
                            u_buffer_ + kUnicodeBufferSize);
    // The decoder advanced `in`; release exactly the bytes it used.
    byte_buffer_->consume(in - reinterpret_cast<uint8_t const*>(in_ptr_));

    if (read_err.has_value()) {
      switch (read_err.value()) {
        case u::ReadError::Invalid:
          // Only return error if we have no bytes to output.
          if (u_buffer_wptr_ == u_buffer_)
            return std::unexpected(io::ReadError::InvalidData);
          break;
        case u::ReadError::End:
          break;
        case u::ReadError::Incomplete:
          if (in_eof_) {
            // Only return error if we have no bytes to output.
            if (u_buffer_wptr_ == u_buffer_)
              return std::unexpected(io::ReadError::InvalidData);
          } else {
            // We clearly need more data, call fill again.
            if (u_buffer_wptr_ == u_buffer_)
              return read(dst, max);
          }
          break;
      }
    }

    // Drop a leading U+FEFF once, on the first call that decoded anything.
    if (skip_bom_ && u_buffer_wptr_ > u_buffer_) {
      if (u_buffer_[0] == 0xfeff) {
        --u_buffer_wptr_;
        std::memmove(u_buffer_, u_buffer_ + 1,
                     (u_buffer_wptr_ - u_buffer_) * sizeof(uint32_t));
      }
      skip_bom_ = false;
    }

    uint32_t const* u_out = u_buffer_;
    void* d_out = dst;
    if (writer_(u_out, u_buffer_wptr_, d_out, max)) {
      // Everything decoded so far was written out.
      assert(u_out == u_buffer_wptr_);
      u_buffer_wptr_ = u_buffer_;
    } else if (u_out == u_buffer_) {
      // Unable to write anything.
      if (max == 0)
        return 0;
      return std::unexpected(io::ReadError::MaxTooSmall);
    } else {
      // Partial write: keep the unwritten code points for the next call.
      size_t const left = u_buffer_wptr_ - u_out;
      std::memmove(u_buffer_, u_out, left * sizeof(uint32_t));
      u_buffer_wptr_ = u_buffer_ + left;
    }
    return static_cast<size_t>(reinterpret_cast<uint8_t*>(d_out) -
                               reinterpret_cast<uint8_t*>(dst));
  }

  /**
   * Skip output by reading up to max bytes into a scratch buffer that is
   * then discarded.
   */
  std::expected<size_t, io::ReadError> skip(size_t max) override {
    auto tmp = std::make_unique_for_overwrite<char[]>(max);
    return read(tmp.get(), max);
  }

 private:
  /**
   * Refresh in_ptr_/in_avail_ from the byte buffer and, when fewer than
   * half a buffer of bytes are pending and EOF has not been seen, read more
   * from the underlying reader. A read of zero bytes marks EOF.
   *
   * @return the underlying reader's error, or std::nullopt on success
   */
  std::optional<io::ReadError> fill() {
    in_ptr_ = byte_buffer_->rptr(in_avail_);
    if (!in_eof_ && in_avail_ < kByteBufferSize / 2) {
      auto* wptr = byte_buffer_->wptr(in_avail_);
      auto got = in_->read(wptr, in_avail_);
      if (got.has_value()) {
        byte_buffer_->commit(got.value());
        if (got.value() == 0)
          in_eof_ = true;
      } else {
        return got.error();
      }
      in_ptr_ = byte_buffer_->rptr(in_avail_);
    }
    return std::nullopt;
  }

  std::unique_ptr<io::Reader> in_;
  u::ReaderConfig const config_;
  UReader reader_;
  UWriter writer_;
  bool skip_bom_;
  void const* in_ptr_{nullptr};
  size_t in_avail_{0};
  bool in_eof_{false};
  std::unique_ptr<Buffer> byte_buffer_{Buffer::fixed(kByteBufferSize)};
  uint32_t u_buffer_[kUnicodeBufferSize];
  uint32_t* u_buffer_wptr_{u_buffer_};
};

/**
 * Strict UTF-8 decoder: stops at the first failed u8::read and reports its
 * error, leaving `in` at the start of the sequence that failed.
 */
struct U8ReaderStrict {
  std::optional<u::ReadError> operator()(uint8_t const*& in,
                                         uint8_t const* in_end,
                                         bool /* in_eof */,
                                         uint32_t*& out,
                                         uint32_t const* out_end) {
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto const tmp = in;
      auto c = u8::read(in, in_end);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        ret = c.error();
        in = tmp;
        break;
      }
    }
    return ret;
  }
};

/**
 * Lenient UTF-8 decoder built on u8::read_replace; only End and Incomplete
 * are reported back to the caller.
 */
struct U8Reader {
  std::optional<u::ReadError> operator()(uint8_t const*& in,
                                         uint8_t const* in_end,
                                         bool in_eof,
                                         uint32_t*& out,
                                         uint32_t const* out_end) {
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto c = u8::read_replace(in, in_end, in_eof);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        switch (c.error()) {
          case u::ReadErrorReplace::End:
            ret = u::ReadError::End;
            break;
          case u::ReadErrorReplace::Incomplete:
            ret = u::ReadError::Incomplete;
            break;
        }
        break;
      }
    }
    return ret;
  }
};

/**
 * Copy UTF-16 code units from [in, in_end) into buffer, byte-swapping each.
 *
 * At most kUSwapBufferSize units are copied so the scratch buffer can never
 * be overrun, however much input is pending. Units are loaded with memcpy so
 * an unaligned `in` is fine.
 *
 * @return number of code units stored in buffer
 */
size_t load_swapped_units(uint8_t const* in, uint8_t const* in_end,
                          uint16_t* buffer) {
  size_t const units =
      std::min(static_cast<size_t>(in_end - in) / 2, kUSwapBufferSize);
  for (size_t i = 0; i < units; ++i) {
    uint16_t unit;
    std::memcpy(&unit, in + i * 2, sizeof(unit));
    buffer[i] = std::byteswap(unit);
  }
  return units;
}

/**
 * Strict decoder for UTF-16 in host byte order. A single trailing byte
 * (less than one code unit) is reported as Incomplete.
 */
struct U16NativeReaderStrict {
  std::optional<u::ReadError> operator()(uint8_t const*& in,
                                         uint8_t const* in_end,
                                         bool /* in_eof */,
                                         uint32_t*& out,
                                         uint32_t const* out_end) {
    auto* it = reinterpret_cast<uint16_t const*>(in);
    auto* const end = it + ((in_end - in) / 2);
    if (it == end && in < in_end)
      return u::ReadError::Incomplete;
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      // Remember the cursor so a failed decode leaves `it` (and therefore
      // `in`) on the first undecoded unit, as U8ReaderStrict does.
      auto* const tmp = it;
      auto c = u16::read(it, end);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        ret = c.error();
        it = tmp;
        break;
      }
    }
    in = reinterpret_cast<uint8_t const*>(it);
    return ret;
  }
};

/**
 * Strict decoder for UTF-16 in the opposite of host byte order. Input is
 * byte-swapped into a bounded scratch buffer and decoded from there.
 */
struct U16SwapReaderStrict {
  std::optional<u::ReadError> operator()(uint8_t const*& in,
                                         uint8_t const* in_end,
                                         bool /* in_eof */,
                                         uint32_t*& out,
                                         uint32_t const* out_end) {
    // Exactly one pending byte: not even a full code unit yet.
    if (in_end - in == 1)
      return u::ReadError::Incomplete;
    auto* it = buffer_;
    auto* const end = it + load_swapped_units(in, in_end, buffer_);
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      // Rewind to here on failure so the failing unit is not consumed.
      auto* const tmp = it;
      auto c = u16::read(it, end);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        ret = c.error();
        it = tmp;
        break;
      }
    }
    in += (it - buffer_) * 2;
    return ret;
  }

 private:
  uint16_t buffer_[kUSwapBufferSize];
};

/**
 * Lenient decoder for UTF-16 in host byte order, built on
 * u16::read_replace. A single trailing byte at EOF becomes U+FFFD.
 */
struct U16NativeReader {
  std::optional<u::ReadError> operator()(uint8_t const*& in,
                                         uint8_t const* in_end,
                                         bool in_eof,
                                         uint32_t*& out,
                                         uint32_t const* out_end) {
    auto* it = reinterpret_cast<uint16_t const*>(in);
    auto* const end = it + ((in_end - in) / 2);
    if (it == end && in < in_end) {
      if (out == out_end)
        return std::nullopt;
      if (in_eof) {
        *(out++) = 0xfffd;
        in = in_end;
        return std::nullopt;
      }
      return u::ReadError::Incomplete;
    }
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto c = u16::read_replace(it, end, in_eof);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        switch (c.error()) {
          case u::ReadErrorReplace::End:
            ret = u::ReadError::End;
            break;
          case u::ReadErrorReplace::Incomplete:
            ret = u::ReadError::Incomplete;
            break;
        }
        break;
      }
    }
    in = reinterpret_cast<uint8_t const*>(it);
    return ret;
  }
};

/**
 * Lenient decoder for UTF-16 in the opposite of host byte order. Input is
 * byte-swapped into a bounded scratch buffer and decoded from there.
 */
struct U16SwapReader {
  std::optional<u::ReadError> operator()(uint8_t const*& in,
                                         uint8_t const* in_end,
                                         bool in_eof,
                                         uint32_t*& out,
                                         uint32_t const* out_end) {
    size_t const avail_units = static_cast<size_t>(in_end - in) / 2;
    if (avail_units == 0 && in < in_end) {
      if (out == out_end)
        return std::nullopt;
      if (in_eof) {
        *(out++) = 0xfffd;
        in = in_end;
        return std::nullopt;
      }
      return u::ReadError::Incomplete;
    }
    auto* it = buffer_;
    auto* const end = it + load_swapped_units(in, in_end, buffer_);
    // If the scratch buffer could not hold all pending units, its end is not
    // the end of the input, so do not let read_replace treat it as EOF.
    bool const window_eof = in_eof && avail_units <= kUSwapBufferSize;
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto c = u16::read_replace(it, end, window_eof);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        switch (c.error()) {
          case u::ReadErrorReplace::End:
            ret = u::ReadError::End;
            break;
          case u::ReadErrorReplace::Incomplete:
            ret = u::ReadError::Incomplete;
            break;
        }
        break;
      }
    }
    in += (it - buffer_) * 2;
    return ret;
  }

 private:
  uint16_t buffer_[kUSwapBufferSize];
};

/**
 * Guess the input encoding from the first bytes of the stream.
 *
 * @param in      start of the pending input
 * @param in_end  end of the pending input
 * @param in_eof  true when no more input will arrive
 * @param format  receives the detected format when true is returned
 * @return true when a format was chosen, false when more bytes are needed
 */
bool detect(uint8_t const* in, uint8_t const* in_end, bool in_eof,
            u::ReaderInputFormat& format) {
  if (in == in_end) {
    if (in_eof) {
      // Doesn't matter, go with UTF-8 just to get out of "detect"
      format = u::ReaderInputFormat::UTF8;
      return true;
    }
    return false;
  }
  // UTF-8 BOM ?
  if (in_end - in >= 3) {
    if (in[0] == 0xef && in[1] == 0xbb && in[2] == 0xbf) {
      format = u::ReaderInputFormat::UTF8;
      return true;
    }
  }
  // UTF-16 BOM ?
  if (in_end - in >= 2) {
    // memcpy instead of a pointer cast: `in` need not be 2-byte aligned.
    uint16_t first;
    std::memcpy(&first, in, sizeof(first));
    if (first == 0xFEFF) {
      format = kU16NativeInputFormat;
      return true;
    }
    if (first == 0xFFFE) {
      format = kU16SwapInputFormat;
      return true;
    }
  }
  // Check for zero bytes, not allowed in UTF-8 and likely for UTF-16
  // encoding western characters.
  if (in_end - in >= 2) {
    if (in[0] == 0x00 && in[1] != 0x00) {
      format = u::ReaderInputFormat::UTF16_BE;
      return true;
    }
    if (in[0] != 0x00 && in[1] == 0x00) {
      format = u::ReaderInputFormat::UTF16_LE;
      return true;
    }
  }
  if (in_end - in >= 2 || in_eof) {
    // We have no idea what it is, fallback to UTF-8 and let it "handle"
    // whatever the input data actually is.
    format = u::ReaderInputFormat::UTF8;
    return true;
  }
  return false;
}

/**
 * Strict decoder that runs detect() until a format is chosen and then
 * forwards every call to the matching strict decoder.
 */
struct DetectReaderStrict {
  std::optional<u::ReadError> operator()(uint8_t const*& in,
                                         uint8_t const* in_end,
                                         bool in_eof,
                                         uint32_t*& out,
                                         uint32_t const* out_end) {
    switch (format_) {
      case u::ReaderInputFormat::DETECT:
        if (detect(in, in_end, in_eof, format_)) {
          // format_ is now concrete; dispatch again.
          return operator()(in, in_end, in_eof, out, out_end);
        }
        return u::ReadError::Incomplete;
      case u::ReaderInputFormat::UTF8:
        return u8_reader_(in, in_end, in_eof, out, out_end);
      case kU16NativeInputFormat:
        return u16_native_reader_(in, in_end, in_eof, out, out_end);
      case kU16SwapInputFormat:
        return u16_swap_reader_(in, in_end, in_eof, out, out_end);
    }
    std::unreachable();
  }

 private:
  u::ReaderInputFormat format_{u::ReaderInputFormat::DETECT};
  U8ReaderStrict u8_reader_;
  U16NativeReaderStrict u16_native_reader_;
  U16SwapReaderStrict u16_swap_reader_;
};

/**
 * Lenient decoder that runs detect() until a format is chosen and then
 * forwards every call to the matching lenient decoder.
 */
struct DetectReader {
  std::optional<u::ReadError> operator()(uint8_t const*& in,
                                         uint8_t const* in_end,
                                         bool in_eof,
                                         uint32_t*& out,
                                         uint32_t const* out_end) {
    switch (format_) {
      case u::ReaderInputFormat::DETECT:
        if (detect(in, in_end, in_eof, format_)) {
          // format_ is now concrete; dispatch again.
          return operator()(in, in_end, in_eof, out, out_end);
        }
        return u::ReadError::Incomplete;
      case u::ReaderInputFormat::UTF8:
        return u8_reader_(in, in_end, in_eof, out, out_end);
      case kU16NativeInputFormat:
        return u16_native_reader_(in, in_end, in_eof, out, out_end);
      case kU16SwapInputFormat:
        return u16_swap_reader_(in, in_end, in_eof, out, out_end);
    }
    std::unreachable();
  }

 private:
  u::ReaderInputFormat format_{u::ReaderInputFormat::DETECT};
  U8Reader u8_reader_;
  U16NativeReader u16_native_reader_;
  U16SwapReader u16_swap_reader_;
};

/**
 * Encode code points as UTF-8 into `out` (out_avail bytes).
 *
 * Advances `in` past every code point written and `out` past every byte
 * written. Returns false when u8::write refused a code point.
 */
struct U8Writer {
  bool operator()(uint32_t const*& in, uint32_t const* in_end, void*& out,
                  size_t out_avail) {
    auto* it = reinterpret_cast<uint8_t*>(out);
    auto* const end = it + out_avail;
    bool ret = true;
    while (in < in_end) {
      if (!u8::write(it, end, *in)) {
        ret = false;
        break;
      }
      ++in;
    }
    out = it;
    return ret;
  }
};

/**
 * Encode code points as host-order UTF-16 into `out` (out_avail bytes,
 * i.e. out_avail / 2 code units).
 *
 * Advances `in` past every code point written and `out` past every unit
 * written. Returns false when u16::write refused a code point.
 */
struct U16NativeWriter {
 public:
  bool operator()(uint32_t const*& in, uint32_t const* in_end, void*& out,
                  size_t out_avail) {
    auto* it = reinterpret_cast<uint16_t*>(out);
    auto* const end = it + (out_avail / 2);
    bool ret = true;
    while (in < in_end) {
      if (!u16::write(it, end, *in)) {
        ret = false;
        break;
      }
      ++in;
    }
    out = it;
    return ret;
  }
};

/*
struct U16SwapWriter {
  bool operator()(uint32_t const* &in, uint32_t const* in_end,
                  void* &out, size_t out_avail) {
    auto* it = reinterpret_cast<uint16_t*>(out);
    auto* const end = it + (out_avail / 2);
    bool ret = true;
    while (in < in_end) {
      auto tmp = it;
      if (!u16::write(it, end, *in)) {
        ret = false;
        break;
      }
      ++in;
      *tmp = std::byteswap(*tmp);
      if (++tmp != it)
        *tmp = std::byteswap(*tmp);
    }
    out = it;
    return ret;
  }
};
*/

}  // namespace

namespace u8 {

namespace {

// Glue type: a UnicodeReader producing UTF-8 that is also a u8::Reader.
// read/skip are overridden to forward to the UnicodeReader implementation.
template <typename UReader>
class UnicodeReaderU8Writer : public UnicodeReader<UReader, U8Writer>,
                              public virtual Reader {
 public:
  UnicodeReaderU8Writer(std::unique_ptr<io::Reader> in, u::ReaderConfig config)
      : UnicodeReader<UReader, U8Writer>(std::move(in), config) {}

  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
    return UnicodeReader<UReader, U8Writer>::read(dst, max);
  }

  std::expected<size_t, io::ReadError> skip(size_t max) override {
    return UnicodeReader<UReader, U8Writer>::skip(max);
  }
};

}  // namespace

/**
 * Read up to max bytes of UTF-8 into data. data is grown to at least max
 * bytes for the read and, on success, resized to the number of bytes read.
 */
std::expected<size_t, io::ReadError> Reader::read(std::string& data,
                                                  size_t max) {
  if (max > data.size())
    data.resize(max);
  auto ret = read(data.data(), max);
  if (ret.has_value()) {
    data.resize(ret.value());
  }
  return ret;
}

/**
 * Same as read(std::string&, size_t) but goes through repeat_read.
 */
std::expected<size_t, io::ReadError> Reader::repeat_read(std::string& data,
                                                         size_t max) {
  if (max > data.size())
    data.resize(max);
  auto ret = repeat_read(data.data(), max);
  if (ret.has_value()) {
    data.resize(ret.value());
  }
  return ret;
}

/**
 * Wrap reader in a transcoder that yields UTF-8, picking the decoder from
 * config.input and config.strict.
 */
std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
                             u::ReaderConfig config) {
  switch (config.input) {
    case u::ReaderInputFormat::UTF8:
      if (config.strict)
        return std::make_unique<UnicodeReaderU8Writer<U8ReaderStrict>>(
            std::move(reader), config);
      return std::make_unique<UnicodeReaderU8Writer<U8Reader>>(
          std::move(reader), config);
    case kU16NativeInputFormat:
      if (config.strict)
        return std::make_unique<UnicodeReaderU8Writer<U16NativeReaderStrict>>(
            std::move(reader), config);
      return std::make_unique<UnicodeReaderU8Writer<U16NativeReader>>(
          std::move(reader), config);
    case kU16SwapInputFormat:
      if (config.strict)
        return std::make_unique<UnicodeReaderU8Writer<U16SwapReaderStrict>>(
            std::move(reader), config);
      return std::make_unique<UnicodeReaderU8Writer<U16SwapReader>>(
          std::move(reader), config);
    case u::ReaderInputFormat::DETECT:
      if (config.strict)
        return std::make_unique<UnicodeReaderU8Writer<DetectReaderStrict>>(
            std::move(reader), config);
      return std::make_unique<UnicodeReaderU8Writer<DetectReader>>(
          std::move(reader), config);
  }
  std::unreachable();
}

/** Open file_path with io::open and wrap it as a UTF-8 producing reader. */
std::expected<std::unique_ptr<Reader>, io::OpenError> open(
    const std::string& file_path, u::ReaderConfig config) {
  auto ret = io::open(file_path);
  if (ret.has_value())
    return open(std::move(ret.value()), config);
  return std::unexpected(ret.error());
}

/** Open file_path with io::openat and wrap it as a UTF-8 producing reader. */
std::expected<std::unique_ptr<Reader>, io::OpenError> openat(
    int dirfd, const std::string& file_path, u::ReaderConfig config) {
  auto ret = io::openat(dirfd, file_path);
  if (ret.has_value())
    return open(std::move(ret.value()), config);
  return std::unexpected(ret.error());
}

}  // namespace u8

namespace u16 {

namespace {

// Glue type: a UnicodeReader producing host-order UTF-16 that is also a
// u16::Reader. read/skip forward to the UnicodeReader implementation.
template <typename UReader>
class UnicodeReaderU16NativeWriter
    : public UnicodeReader<UReader, U16NativeWriter>,
      public virtual Reader {
 public:
  UnicodeReaderU16NativeWriter(std::unique_ptr<io::Reader> in,
                               u::ReaderConfig config)
      : UnicodeReader<UReader, U16NativeWriter>(std::move(in), config) {}

  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
    return UnicodeReader<UReader, U16NativeWriter>::read(dst, max);
  }

  std::expected<size_t, io::ReadError> skip(size_t max) override {
    return UnicodeReader<UReader, U16NativeWriter>::skip(max);
  }
};

}  // namespace

/**
 * Read up to max UTF-16 code units into data.
 *
 * The raw read works in bytes (max * 2); on success data is resized to the
 * number of code units read and that count is returned.
 */
std::expected<size_t, io::ReadError> Reader::read(std::u16string& data,
                                                  size_t max) {
  if (max > data.size())
    data.resize(max);
  auto ret = read(data.data(), max * 2);
  if (ret.has_value()) {
    // ret is a byte count; the string length is measured in code units.
    size_t const units = ret.value() / 2;
    data.resize(units);
    return units;
  }
  return ret;
}

/**
 * Same as read(std::u16string&, size_t) but goes through repeat_read.
 */
std::expected<size_t, io::ReadError> Reader::repeat_read(std::u16string& data,
                                                         size_t max) {
  if (max > data.size())
    data.resize(max);
  auto ret = repeat_read(data.data(), max * 2);
  if (ret.has_value()) {
    // ret is a byte count; the string length is measured in code units.
    size_t const units = ret.value() / 2;
    data.resize(units);
    return units;
  }
  return ret;
}

/**
 * Wrap reader in a transcoder that yields host-order UTF-16, picking the
 * decoder from config.input and config.strict.
 */
std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
                             u::ReaderConfig config) {
  switch (config.input) {
    case u::ReaderInputFormat::UTF8:
      if (config.strict)
        return std::make_unique<UnicodeReaderU16NativeWriter<U8ReaderStrict>>(
            std::move(reader), config);
      return std::make_unique<UnicodeReaderU16NativeWriter<U8Reader>>(
          std::move(reader), config);
    case kU16NativeInputFormat:
      if (config.strict)
        return std::make_unique<
            UnicodeReaderU16NativeWriter<U16NativeReaderStrict>>(
            std::move(reader), config);
      return std::make_unique<UnicodeReaderU16NativeWriter<U16NativeReader>>(
          std::move(reader), config);
    case kU16SwapInputFormat:
      if (config.strict)
        return std::make_unique<
            UnicodeReaderU16NativeWriter<U16SwapReaderStrict>>(
            std::move(reader), config);
      return std::make_unique<UnicodeReaderU16NativeWriter<U16SwapReader>>(
          std::move(reader), config);
    case u::ReaderInputFormat::DETECT:
      if (config.strict)
        return std::make_unique<
            UnicodeReaderU16NativeWriter<DetectReaderStrict>>(
            std::move(reader), config);
      return std::make_unique<UnicodeReaderU16NativeWriter<DetectReader>>(
          std::move(reader), config);
  }
  std::unreachable();
}

/** Open file_path with io::open and wrap it as a UTF-16 producing reader. */
std::expected<std::unique_ptr<Reader>, io::OpenError> open(
    const std::string& file_path, u::ReaderConfig config) {
  auto ret = io::open(file_path);
  if (ret.has_value())
    return open(std::move(ret.value()), config);
  return std::unexpected(ret.error());
}

/** Open file_path with io::openat and wrap it as a UTF-16 producing reader. */
std::expected<std::unique_ptr<Reader>, io::OpenError> openat(
    int dirfd, const std::string& file_path, u::ReaderConfig config) {
  auto ret = io::openat(dirfd, file_path);
  if (ret.has_value())
    return open(std::move(ret.value()), config);
  return std::unexpected(ret.error());
}

}  // namespace u16