diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2025-09-15 20:52:51 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2025-09-15 20:52:51 +0200 |
| commit | 18a622f378b403788c67fc785d30f4609caa3fc7 (patch) | |
| tree | 9d13f4ef49a06c9e4837487f61bc90b734ad9b9a /src/uio.cc | |
| parent | 28c6425e4ed1cd2eab538e7cba08c18aa83d8af5 (diff) | |
uio: Unicode reader
Reads UTF-8 and UTF-16 into UTF-8 or UTF-16 strings.
If strict is true, fails at first invalid character.
If strict is false, invalid characters are replaced with U+FFFD.
For the replacement, I changed the behavior of uN::read_replace to only
skip one byte. Otherwise, the common invalid case where ISO-8859-1 or
Windows-1252 data is read as UTF-8 would skip many characters.
If skip_bom is true, any BOM at the start of the stream is ignored.
If skip_bom is false, any BOM will be included in the output.
The input format can be forced; otherwise DETECT is used, which
tries to guess the format and falls back to UTF-8.
Diffstat (limited to 'src/uio.cc')
| -rw-r--r-- | src/uio.cc | 700 |
1 files changed, 700 insertions, 0 deletions
#include "uio.hh"

#include "buffer.hh"
#include "u8.hh"
#include "u16.hh"

#include <algorithm>
#include <bit>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <expected>
#include <memory>
#include <optional>
#include <string>
#include <utility>

namespace {

// UTF-16 input format whose byte order matches the host.
constexpr u::ReaderInputFormat kU16NativeInputFormat =
    (std::endian::native == std::endian::big)
    ? u::ReaderInputFormat::UTF16_BE
    : u::ReaderInputFormat::UTF16_LE;

// UTF-16 input format whose byte order is the opposite of the host.
constexpr u::ReaderInputFormat kU16SwapInputFormat =
    (std::endian::native == std::endian::big)
    ? u::ReaderInputFormat::UTF16_LE
    : u::ReaderInputFormat::UTF16_BE;

// Size, in bytes, of the raw input buffer.
constexpr size_t kByteBufferSize = 65535;
// Capacity, in code points, of the decoded intermediate buffer.
constexpr size_t kUnicodeBufferSize = 8192;
// Capacity, in 16-bit units, of the scratch buffer used by the byte-swapping
// UTF-16 readers.
constexpr size_t kUSwapBufferSize = kByteBufferSize / 4;

// Generic transcoding reader.
//
// UReader decodes raw bytes into code points (uint32_t), UWriter encodes
// code points into the caller's destination buffer. Both are stateless or
// self-contained functors, see the reader and writer structs below.
template<typename UReader, typename UWriter>
class UnicodeReader : public io::Reader {
 public:
  UnicodeReader(std::unique_ptr<io::Reader> in, u::ReaderConfig config)
      : in_(std::move(in)), config_(config), skip_bom_(config_.skip_bom) {}

  // Decodes buffered input and encodes it into dst.
  //
  // max is the size of dst in bytes. Returns the number of bytes written
  // to dst, or an error:
  //  - whatever error the wrapped reader returned while filling,
  //  - InvalidData if the input is invalid/truncated and there is nothing
  //    decoded left to hand out,
  //  - MaxTooSmall if max is non-zero but too small to hold even one
  //    encoded code point.
  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
    auto err = fill();
    if (err.has_value())
      return std::unexpected(err.value());

    auto* in = reinterpret_cast<uint8_t const*>(in_ptr_);
    auto read_err = reader_(in, in + in_avail_, in_eof_,
                            u_buffer_wptr_,
                            u_buffer_ + kUnicodeBufferSize);
    // The reader advanced `in` past everything it decoded.
    byte_buffer_->consume(in - reinterpret_cast<uint8_t const*>(in_ptr_));
    if (read_err.has_value()) {
      switch (read_err.value()) {
      case u::ReadError::Invalid:
        // Only return error if we have no bytes to output.
        if (u_buffer_wptr_ == u_buffer_)
          return std::unexpected(io::ReadError::InvalidData);
        break;
      case u::ReadError::End:
        break;
      case u::ReadError::Incomplete:
        if (in_eof_) {
          // Only return error if we have no bytes to output.
          if (u_buffer_wptr_ == u_buffer_)
            return std::unexpected(io::ReadError::InvalidData);
        } else {
          // We clearly need more data, call fill again.
          if (u_buffer_wptr_ == u_buffer_)
            return read(dst, max);
        }
        break;
      }
    }

    // The BOM decision is made once, on the first decoded code point.
    if (skip_bom_ && u_buffer_wptr_ > u_buffer_) {
      if (u_buffer_[0] == 0xfeff) {
        --u_buffer_wptr_;
        memmove(u_buffer_, u_buffer_ + 1,
                (u_buffer_wptr_ - u_buffer_) * sizeof(uint32_t));
      }
      skip_bom_ = false;
    }

    auto* u_out = const_cast<uint32_t const*>(u_buffer_);
    auto* d_out = dst;
    if (writer_(u_out, u_buffer_wptr_, d_out, max)) {
      // Everything decoded was written out.
      assert(u_out == u_buffer_wptr_);
      u_buffer_wptr_ = u_buffer_;
    } else if (u_out == u_buffer_) {
      // Unable to write anything.
      if (max == 0) return 0;
      return std::unexpected(io::ReadError::MaxTooSmall);
    } else {
      // Partial write, keep the remaining code points for the next call.
      size_t left = u_buffer_wptr_ - u_out;
      memmove(u_buffer_, u_out, left * sizeof(uint32_t));
      u_buffer_wptr_ = u_buffer_ + left;
    }

    return reinterpret_cast<char*>(d_out) - reinterpret_cast<char*>(dst);
  }

  // Skips up to max bytes of *output* by decoding into a temporary buffer
  // of max bytes and discarding it. Returns what read() returns.
  std::expected<size_t, io::ReadError> skip(size_t max) override {
    auto tmp = std::make_unique_for_overwrite<char[]>(max);
    return read(tmp.get(), max);
  }

 private:
  // Refreshes in_ptr_/in_avail_ and, unless EOF was already seen or at
  // least half a buffer is still unread, reads more from the wrapped
  // reader. A zero-length read marks EOF. Returns the wrapped reader's
  // error, if any.
  std::optional<io::ReadError> fill() {
    in_ptr_ = byte_buffer_->rptr(in_avail_);
    if (!in_eof_ && in_avail_ < kByteBufferSize / 2) {
      // Note: in_avail_ is temporarily reused for the writable size.
      auto* wptr = byte_buffer_->wptr(in_avail_);
      auto got = in_->read(wptr, in_avail_);
      if (got.has_value()) {
        byte_buffer_->commit(got.value());
        if (got.value() == 0)
          in_eof_ = true;
      } else {
        return got.error();
      }
      in_ptr_ = byte_buffer_->rptr(in_avail_);
    }
    return std::nullopt;
  }

  std::unique_ptr<io::Reader> in_;
  u::ReaderConfig const config_;
  UReader reader_;
  UWriter writer_;
  bool skip_bom_;
  void const* in_ptr_{nullptr};
  size_t in_avail_{0};
  bool in_eof_{false};
  std::unique_ptr<Buffer> byte_buffer_{Buffer::fixed(kByteBufferSize)};
  uint32_t u_buffer_[kUnicodeBufferSize];
  uint32_t* u_buffer_wptr_{u_buffer_};
};

// All readers below share one call signature:
//   in/in_end  raw input; `in` is advanced past the bytes that were decoded
//   in_eof     true if no more input will follow in_end
//   out/out_end destination for code points; `out` is advanced
// They return std::nullopt when they stopped because `out` is full,
// otherwise the reason they stopped.

// Strict UTF-8 decoder: stops at the first error and leaves `in` pointing
// at the offending sequence.
struct U8ReaderStrict {
  std::optional<u::ReadError> operator()(uint8_t const* &in,
                                         uint8_t const* in_end,
                                         bool /* in_eof */,
                                         uint32_t* &out,
                                         uint32_t const* out_end) {
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto const tmp = in;
      auto c = u8::read(in, in_end);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        ret = c.error();
        // Do not consume the sequence that failed to decode.
        in = tmp;
        break;
      }
    }
    return ret;
  }
};

// Lenient UTF-8 decoder built on u8::read_replace.
struct U8Reader {
  std::optional<u::ReadError> operator()(uint8_t const* &in,
                                         uint8_t const* in_end,
                                         bool in_eof, uint32_t* &out,
                                         uint32_t const* out_end) {
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto c = u8::read_replace(in, in_end, in_eof);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        switch (c.error()) {
        case u::ReadErrorReplace::End:
          ret = u::ReadError::End;
          break;
        case u::ReadErrorReplace::Incomplete:
          ret = u::ReadError::Incomplete;
          break;
        }
        break;
      }
    }
    return ret;
  }
};

// Strict decoder for UTF-16 in host byte order.
struct U16NativeReaderStrict {
  std::optional<u::ReadError> operator()(uint8_t const* &in,
                                         uint8_t const* in_end,
                                         bool /* in_eof */,
                                         uint32_t* &out,
                                         uint32_t const* out_end) {
    auto* it = reinterpret_cast<uint16_t const*>(in);
    auto* const end = it + ((in_end - in) / 2);
    // A single byte is never a complete UTF-16 unit.
    if (it == end && in < in_end)
      return u::ReadError::Incomplete;
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto* const tmp = it;
      auto c = u16::read(it, end);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        ret = c.error();
        // Do not consume the unit(s) that failed to decode, so the error
        // is reported again once the pending output has been delivered.
        it = tmp;
        break;
      }
    }
    in = reinterpret_cast<uint8_t const*>(it);
    return ret;
  }
};

// Copies up to `capacity` complete 16-bit units from [in, in_end) into dst,
// swapping the byte order of each. Units are loaded with memcpy so `in`
// does not need to be 2-byte aligned. Returns the number of units written.
inline size_t swap_units(uint8_t const* in, uint8_t const* in_end,
                         uint16_t* dst, size_t capacity) {
  size_t const count =
      std::min(static_cast<size_t>(in_end - in) / 2, capacity);
  for (size_t i = 0; i < count; ++i) {
    uint16_t unit;
    std::memcpy(&unit, in + i * 2, sizeof(unit));
    dst[i] = std::byteswap(unit);
  }
  return count;
}

// Strict decoder for UTF-16 in the opposite of host byte order. Input is
// byte-swapped into a scratch buffer and then decoded as native UTF-16.
struct U16SwapReaderStrict {
  std::optional<u::ReadError> operator()(uint8_t const* &in,
                                         uint8_t const* in_end,
                                         bool /* in_eof */,
                                         uint32_t* &out,
                                         uint32_t const* out_end) {
    // A single byte is never a complete UTF-16 unit.
    if (in_end - in == 1)
      return u::ReadError::Incomplete;
    // The window is clamped to the scratch buffer; the caller may offer
    // more input than buffer_ can hold.
    auto* it = buffer_;
    auto* const end = it + swap_units(in, in_end, buffer_, kUSwapBufferSize);
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto* const tmp = it;
      auto c = u16::read(it, end);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        ret = c.error();
        // Do not consume the unit(s) that failed to decode.
        it = tmp;
        break;
      }
    }
    in += (it - buffer_) * 2;
    return ret;
  }

 private:
  uint16_t buffer_[kUSwapBufferSize];
};

// Lenient decoder for UTF-16 in host byte order, built on
// u16::read_replace. A trailing odd byte at end of input becomes U+FFFD.
struct U16NativeReader {
  std::optional<u::ReadError> operator()(uint8_t const* &in,
                                         uint8_t const* in_end,
                                         bool in_eof,
                                         uint32_t* &out,
                                         uint32_t const* out_end) {
    auto* it = reinterpret_cast<uint16_t const*>(in);
    auto* const end = it + ((in_end - in) / 2);
    if (it == end && in < in_end) {
      // Exactly one byte available.
      if (out == out_end)
        return std::nullopt;
      if (in_eof) {
        *(out++) = 0xfffd;
        in = in_end;
        return std::nullopt;
      }
      return u::ReadError::Incomplete;
    }
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto c = u16::read_replace(it, end, in_eof);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        switch (c.error()) {
        case u::ReadErrorReplace::End:
          ret = u::ReadError::End;
          break;
        case u::ReadErrorReplace::Incomplete:
          ret = u::ReadError::Incomplete;
          break;
        }
        break;
      }
    }
    in = reinterpret_cast<uint8_t const*>(it);
    return ret;
  }
};

// Lenient decoder for UTF-16 in the opposite of host byte order. Input is
// byte-swapped into a scratch buffer and then decoded as native UTF-16.
struct U16SwapReader {
  std::optional<u::ReadError> operator()(uint8_t const* &in,
                                         uint8_t const* in_end,
                                         bool in_eof,
                                         uint32_t* &out,
                                         uint32_t const* out_end) {
    size_t const avail_units = static_cast<size_t>(in_end - in) / 2;
    if (avail_units == 0 && in < in_end) {
      // Exactly one byte available.
      if (out == out_end)
        return std::nullopt;
      if (in_eof) {
        *(out++) = 0xfffd;
        in = in_end;
        return std::nullopt;
      }
      return u::ReadError::Incomplete;
    }
    // The window is clamped to the scratch buffer; the caller may offer
    // more input than buffer_ can hold.
    size_t const units = swap_units(in, in_end, buffer_, kUSwapBufferSize);
    auto* it = buffer_;
    auto* const end = it + units;
    // If the window was truncated its end is not the end of the stream,
    // so a surrogate at the window edge must not be replaced yet.
    bool const window_eof = in_eof && units == avail_units;
    std::optional<u::ReadError> ret;
    while (out < out_end) {
      auto c = u16::read_replace(it, end, window_eof);
      if (c.has_value()) {
        *(out++) = c.value();
      } else {
        switch (c.error()) {
        case u::ReadErrorReplace::End:
          ret = u::ReadError::End;
          break;
        case u::ReadErrorReplace::Incomplete:
          ret = u::ReadError::Incomplete;
          break;
        }
        break;
      }
    }
    in += (it - buffer_) * 2;
    return ret;
  }

 private:
  uint16_t buffer_[kUSwapBufferSize];
};

// Guesses the encoding of [in, in_end).
//
// Returns true and sets format when a decision was made, false when more
// input is needed (never false when in_eof is true). Checks, in order:
// UTF-8 BOM, UTF-16 BOM (either byte order), a zero byte in one half of the
// first 16-bit unit, and finally falls back to UTF-8.
bool detect(uint8_t const* in, uint8_t const* in_end, bool in_eof,
            u::ReaderInputFormat &format) {
  if (in == in_end) {
    if (in_eof) {
      // Doesn't matter, go with UTF-8 just to get out of "detect"
      format = u::ReaderInputFormat::UTF8;
      return true;
    }
    return false;
  }

  // UTF-8 BOM ?
  if (in_end - in >= 3) {
    if (in[0] == 0xef && in[1] == 0xbb && in[2] == 0xbf) {
      format = u::ReaderInputFormat::UTF8;
      return true;
    }
  }

  // UTF-16 BOM ?
  if (in_end - in >= 2) {
    // memcpy instead of a pointer cast: `in` is a byte pointer with no
    // alignment guarantee.
    uint16_t first;
    std::memcpy(&first, in, sizeof(first));
    if (first == 0xFEFF) {
      format = kU16NativeInputFormat;
      return true;
    }
    if (first == 0xFFFE) {
      format = kU16SwapInputFormat;
      return true;
    }
  }

  // Check for zero bytes, not allowed in UTF-8 and likely for UTF-16
  // encoding western characters.
  if (in_end - in >= 2) {
    if (in[0] == 0x00 && in[1] != 0x00) {
      format = u::ReaderInputFormat::UTF16_BE;
      return true;
    }
    if (in[0] != 0x00 && in[1] == 0x00) {
      format = u::ReaderInputFormat::UTF16_LE;
      return true;
    }
  }

  if (in_end - in >= 2 || in_eof) {
    // We have no idea what it is, fallback to UTF-8 and let it "handle"
    // whatever the input data actually is.
    format = u::ReaderInputFormat::UTF8;
    return true;
  }
  return false;
}

// Strict reader that picks the decoder on first use via detect() and then
// sticks with it.
struct DetectReaderStrict {
  std::optional<u::ReadError> operator()(uint8_t const* &in,
                                         uint8_t const* in_end,
                                         bool in_eof,
                                         uint32_t* &out,
                                         uint32_t const* out_end) {
    switch (format_) {
    case u::ReaderInputFormat::DETECT:
      if (detect(in, in_end, in_eof, format_)) {
        // format_ is no longer DETECT, dispatch to the chosen decoder.
        return operator()(in, in_end, in_eof, out, out_end);
      }
      return u::ReadError::Incomplete;
    case u::ReaderInputFormat::UTF8:
      return u8_reader_(in, in_end, in_eof, out, out_end);
    case kU16NativeInputFormat:
      return u16_native_reader_(in, in_end, in_eof, out, out_end);
    case kU16SwapInputFormat:
      return u16_swap_reader_(in, in_end, in_eof, out, out_end);
    }
    std::unreachable();
  }

 private:
  u::ReaderInputFormat format_{u::ReaderInputFormat::DETECT};
  U8ReaderStrict u8_reader_;
  U16NativeReaderStrict u16_native_reader_;
  U16SwapReaderStrict u16_swap_reader_;
};

// Lenient reader that picks the decoder on first use via detect() and then
// sticks with it.
struct DetectReader {
  std::optional<u::ReadError> operator()(uint8_t const* &in,
                                         uint8_t const* in_end,
                                         bool in_eof,
                                         uint32_t* &out,
                                         uint32_t const* out_end) {
    switch (format_) {
    case u::ReaderInputFormat::DETECT:
      if (detect(in, in_end, in_eof, format_)) {
        // format_ is no longer DETECT, dispatch to the chosen decoder.
        return operator()(in, in_end, in_eof, out, out_end);
      }
      return u::ReadError::Incomplete;
    case u::ReaderInputFormat::UTF8:
      return u8_reader_(in, in_end, in_eof, out, out_end);
    case kU16NativeInputFormat:
      return u16_native_reader_(in, in_end, in_eof, out, out_end);
    case kU16SwapInputFormat:
      return u16_swap_reader_(in, in_end, in_eof, out, out_end);
    }
    std::unreachable();
  }

 private:
  u::ReaderInputFormat format_{u::ReaderInputFormat::DETECT};
  U8Reader u8_reader_;
  U16NativeReader u16_native_reader_;
  U16SwapReader u16_swap_reader_;
};

// Writers share one call signature:
//   in/in_end  code points to encode; `in` is advanced past those written
//   out        destination, advanced past the bytes written
//   out_avail  size of the destination in bytes
// They return true if everything in [in, in_end) was written.

// Encodes code points as UTF-8.
struct U8Writer {
  bool operator()(uint32_t const* &in, uint32_t const* in_end,
                  void* &out, size_t out_avail) {
    auto* it = reinterpret_cast<uint8_t*>(out);
    auto* const end = it + out_avail;
    bool ret = true;
    while (in < in_end) {
      if (!u8::write(it, end, *in)) {
        ret = false;
        break;
      }
      ++in;
    }
    out = it;
    return ret;
  }
};

// Encodes code points as UTF-16 in host byte order. An odd trailing byte
// of out_avail is never used.
struct U16NativeWriter {
 public:
  bool operator()(uint32_t const* &in, uint32_t const* in_end,
                  void* &out, size_t out_avail) {
    auto* it = reinterpret_cast<uint16_t*>(out);
    auto* const end = it + (out_avail / 2);
    bool ret = true;
    while (in < in_end) {
      if (!u16::write(it, end, *in)) {
        ret = false;
        break;
      }
      ++in;
    }
    out = it;
    return ret;
  }
};

/*
struct U16SwapWriter {
  bool operator()(uint32_t const* &in, uint32_t const* in_end,
                  void* &out, size_t out_avail) {
    auto* it = reinterpret_cast<uint16_t*>(out);
    auto* const end = it + (out_avail / 2);
    bool ret = true;
    while (in < in_end) {
      auto tmp = it;
      if (!u16::write(it, end, *in)) {
        ret = false;
        break;
      }
      ++in;
      *tmp = std::byteswap(*tmp);
      if (++tmp != it) *tmp = std::byteswap(*tmp);
    }
    out = it;
    return ret;
  }
};
*/

}  // namespace

namespace u8 {

namespace {

// Glue between the generic UnicodeReader (UTF-8 output) and the public
// u8::Reader interface.
template<typename UReader>
class UnicodeReaderU8Writer : public UnicodeReader<UReader, U8Writer>,
                              public virtual Reader {
 public:
  UnicodeReaderU8Writer(std::unique_ptr<io::Reader> in,
                        u::ReaderConfig config)
      : UnicodeReader<UReader, U8Writer>(std::move(in), config) {}

  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
    return UnicodeReader<UReader, U8Writer>::read(dst, max);
  }

  std::expected<size_t, io::ReadError> skip(size_t max) override {
    return UnicodeReader<UReader, U8Writer>::skip(max);
  }
};

}  // namespace


// Reads up to max bytes of UTF-8 into data. On success data is resized to
// exactly the number of bytes read, which is also returned.
std::expected<size_t, io::ReadError> Reader::read(std::string& data,
                                                  size_t max) {
  if (max > data.size())
    data.resize(max);
  auto ret = read(data.data(), max);
  if (ret.has_value()) {
    data.resize(ret.value());
  }
  return ret;
}

// Same as read(std::string&, size_t) but backed by repeat_read(void*, size_t).
std::expected<size_t, io::ReadError> Reader::repeat_read(
    std::string& data, size_t max) {
  if (max > data.size())
    data.resize(max);
  auto ret = repeat_read(data.data(), max);
  if (ret.has_value()) {
    data.resize(ret.value());
  }
  return ret;
}

// Wraps reader in a transcoder producing UTF-8, choosing the decoder from
// config.input and config.strict.
std::unique_ptr<Reader> open(
    std::unique_ptr<io::Reader> reader, u::ReaderConfig config) {
  switch (config.input) {
  case u::ReaderInputFormat::UTF8:
    if (config.strict)
      return std::make_unique<UnicodeReaderU8Writer<U8ReaderStrict>>(
          std::move(reader), config);
    return std::make_unique<UnicodeReaderU8Writer<U8Reader>>(
        std::move(reader), config);
  case kU16NativeInputFormat:
    if (config.strict)
      return std::make_unique<UnicodeReaderU8Writer<U16NativeReaderStrict>>(
          std::move(reader), config);
    return std::make_unique<UnicodeReaderU8Writer<U16NativeReader>>(
        std::move(reader), config);
  case kU16SwapInputFormat:
    if (config.strict)
      return std::make_unique<UnicodeReaderU8Writer<U16SwapReaderStrict>>(
          std::move(reader), config);
    return std::make_unique<UnicodeReaderU8Writer<U16SwapReader>>(
        std::move(reader), config);
  case u::ReaderInputFormat::DETECT:
    if (config.strict)
      return std::make_unique<UnicodeReaderU8Writer<DetectReaderStrict>>(
          std::move(reader), config);
    return std::make_unique<UnicodeReaderU8Writer<DetectReader>>(
        std::move(reader), config);
  }
  std::unreachable();
}

// Opens file_path with io::open and wraps it, see open() above.
std::expected<std::unique_ptr<Reader>, io::OpenError> open(
    const std::string& file_path, u::ReaderConfig config) {
  auto ret = io::open(file_path);
  if (ret.has_value())
    return open(std::move(ret.value()), config);
  return std::unexpected(ret.error());
}

// Opens file_path relative to dirfd with io::openat and wraps it, see
// open() above.
std::expected<std::unique_ptr<Reader>, io::OpenError> openat(
    int dirfd, const std::string& file_path, u::ReaderConfig config) {
  auto ret = io::openat(dirfd, file_path);
  if (ret.has_value())
    return open(std::move(ret.value()), config);
  return std::unexpected(ret.error());
}

}  // namespace u8

namespace u16 {

namespace {

// Glue between the generic UnicodeReader (native UTF-16 output) and the
// public u16::Reader interface.
template<typename UReader>
class UnicodeReaderU16NativeWriter : public UnicodeReader<UReader,
                                                          U16NativeWriter>,
                                     public virtual Reader {
 public:
  UnicodeReaderU16NativeWriter(std::unique_ptr<io::Reader> in,
                               u::ReaderConfig config)
      : UnicodeReader<UReader, U16NativeWriter>(std::move(in), config) {}

  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
    return UnicodeReader<UReader, U16NativeWriter>::read(dst, max);
  }

  std::expected<size_t, io::ReadError> skip(size_t max) override {
    return UnicodeReader<UReader, U16NativeWriter>::skip(max);
  }
};

}  // namespace

// Reads up to max UTF-16 units into data. On success data is resized to
// exactly the number of units read, which is also returned.
std::expected<size_t, io::ReadError> Reader::read(std::u16string& data,
                                                  size_t max) {
  if (max > data.size())
    data.resize(max);
  // The raw read works in bytes, the string in 16-bit units.
  auto ret = read(data.data(), max * 2);
  if (ret.has_value()) {
    size_t const units = ret.value() / 2;
    data.resize(units);
    return units;
  }
  return ret;
}

// Same as read(std::u16string&, size_t) but backed by
// repeat_read(void*, size_t).
std::expected<size_t, io::ReadError> Reader::repeat_read(
    std::u16string& data, size_t max) {
  if (max > data.size())
    data.resize(max);
  // The raw read works in bytes, the string in 16-bit units.
  auto ret = repeat_read(data.data(), max * 2);
  if (ret.has_value()) {
    size_t const units = ret.value() / 2;
    data.resize(units);
    return units;
  }
  return ret;
}

// Wraps reader in a transcoder producing native-endian UTF-16, choosing
// the decoder from config.input and config.strict.
std::unique_ptr<Reader> open(
    std::unique_ptr<io::Reader> reader, u::ReaderConfig config) {
  switch (config.input) {
  case u::ReaderInputFormat::UTF8:
    if (config.strict)
      return std::make_unique<UnicodeReaderU16NativeWriter<U8ReaderStrict>>(
          std::move(reader), config);
    return std::make_unique<UnicodeReaderU16NativeWriter<U8Reader>>(
        std::move(reader), config);
  case kU16NativeInputFormat:
    if (config.strict)
      return std::make_unique<UnicodeReaderU16NativeWriter<
        U16NativeReaderStrict>>(std::move(reader), config);
    return std::make_unique<UnicodeReaderU16NativeWriter<U16NativeReader>>(
        std::move(reader), config);
  case kU16SwapInputFormat:
    if (config.strict)
      return std::make_unique<UnicodeReaderU16NativeWriter<
        U16SwapReaderStrict>>(std::move(reader), config);
    return std::make_unique<UnicodeReaderU16NativeWriter<U16SwapReader>>(
        std::move(reader), config);
  case u::ReaderInputFormat::DETECT:
    if (config.strict)
      return std::make_unique<UnicodeReaderU16NativeWriter<
        DetectReaderStrict>>(std::move(reader), config);
    return std::make_unique<UnicodeReaderU16NativeWriter<DetectReader>>(
        std::move(reader), config);
  }
  std::unreachable();
}

// Opens file_path with io::open and wraps it, see open() above.
std::expected<std::unique_ptr<Reader>, io::OpenError> open(
    const std::string& file_path, u::ReaderConfig config) {
  auto ret = io::open(file_path);
  if (ret.has_value())
    return open(std::move(ret.value()), config);
  return std::unexpected(ret.error());
}

// Opens file_path relative to dirfd with io::openat and wraps it, see
// open() above.
std::expected<std::unique_ptr<Reader>, io::OpenError> openat(
    int dirfd, const std::string& file_path, u::ReaderConfig config) {
  auto ret = io::openat(dirfd, file_path);
  if (ret.has_value())
    return open(std::move(ret.value()), config);
  return std::unexpected(ret.error());
}

}  // namespace u16
