From 18a622f378b403788c67fc785d30f4609caa3fc7 Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Mon, 15 Sep 2025 20:52:51 +0200 Subject: uio: Unicode reader Reads UTF-8 and UTF-16 into UTF-8 or UTF-16 strings. If strict is true, fails at first invalid character. If strict is false, invalid characters are replaced with U+FFFD. For the replacement, I changed the behavior of uN::read_replace to only jump one byte. Otherwise, the common invalid case where ISO-8859-1 or WIN-1252 is read as UTF-8 would skip many characters. If skip_bom is true, any BOM at the start of the stream is ignored. If skip_bom is false, any BOM will be included. The input format can be forced; if not, DETECT is used, which will try to guess and then fall back to UTF-8. --- src/uio.hh | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 src/uio.hh (limited to 'src/uio.hh') diff --git a/src/uio.hh b/src/uio.hh new file mode 100644 index 0000000..a0911a1 --- /dev/null +++ b/src/uio.hh @@ -0,0 +1,78 @@ +#ifndef UIO_HH +#define UIO_HH + +#include "io.hh" // IWYU pragma: export + +#include +#include +#include + +namespace u { + +enum class ReaderInputFormat { + UTF8, + UTF16_BE, + UTF16_LE, + DETECT, +}; + +struct ReaderConfig { + // If false (default), invalid data is replaced with U+FFFD + bool strict{false}; + // Input format + ReaderInputFormat input{ReaderInputFormat::DETECT}; + // If true (default), any BOM found at start of stream will be skipped + bool skip_bom{true}; +}; + +} // namespace u + +namespace u8 { + +class Reader : public io::Reader { + public: + using io::Reader::read; + using io::Reader::repeat_read; + + [[nodiscard]] std::expected read( + std::string& data, size_t max); + + [[nodiscard]] std::expected repeat_read( + std::string& data, size_t max); +}; + +[[nodiscard]] std::unique_ptr open( + std::unique_ptr reader, u::ReaderConfig config = {}); + +[[nodiscard]] std::expected, io::OpenError> open( + const std::string& file_path, 
u::ReaderConfig config = {}); +[[nodiscard]] std::expected, io::OpenError> openat( + int dirfd, const std::string& file_path, u::ReaderConfig config = {}); + +} // namespace u8 + +namespace u16 { + +class Reader : public io::Reader { + public: + using io::Reader::read; + using io::Reader::repeat_read; + + [[nodiscard]] std::expected read( + std::u16string& data, size_t max); + + [[nodiscard]] std::expected repeat_read( + std::u16string& data, size_t max); +}; + +[[nodiscard]] std::unique_ptr open( + std::unique_ptr reader, u::ReaderConfig config = {}); + +[[nodiscard]] std::expected, io::OpenError> open( + const std::string& file_path, u::ReaderConfig config = {}); +[[nodiscard]] std::expected, io::OpenError> openat( + int dirfd, const std::string& file_path, u::ReaderConfig config = {}); + +} // namespace u16 + +#endif // UIO_HH -- cgit v1.3