diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2025-09-15 20:52:51 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2025-09-15 20:52:51 +0200 |
| commit | 18a622f378b403788c67fc785d30f4609caa3fc7 (patch) | |
| tree | 9d13f4ef49a06c9e4837487f61bc90b734ad9b9a /src/uio.hh | |
| parent | 28c6425e4ed1cd2eab538e7cba08c18aa83d8af5 (diff) | |
uio: Unicode reader
Reads UTF-8 and UTF-16 into UTF-8 or UTF-16 strings.
If strict is true, fails at first invalid character.
If strict is false, invalid characters are replaced with U+FFFD.
For the replacement, I changed the behavior of uN::read_replace to only
skip one byte. Otherwise, a common invalid case, where ISO-8859-1 or
WIN-1252 text is read as UTF-8, would skip many characters.
If skip_bom is true any bom at start of stream is ignored.
If skip_bom is false any bom will be included.
The input format can be forced; if it is not, DETECT is used, which
tries to guess the format and then falls back to UTF-8.
Diffstat (limited to 'src/uio.hh')
| -rw-r--r-- | src/uio.hh | 78 |
1 files changed, 78 insertions, 0 deletions
#ifndef UIO_HH
#define UIO_HH

// Unicode readers layered on top of io::Reader.
//
// u8::Reader yields UTF-8 (std::string) and u16::Reader yields UTF-16
// (std::u16string). Both are configured through u::ReaderConfig.

#include "io.hh"  // IWYU pragma: export

#include <cstddef>
#include <expected>
#include <memory>  // std::unique_ptr is used directly by the open() factories
#include <string>

namespace u {

// Encoding of the incoming byte stream.
enum class ReaderInputFormat {
  UTF8,
  UTF16_BE,
  UTF16_LE,
  // Try to guess the input format, falling back to UTF-8.
  DETECT,
};

// Options shared by the u8 and u16 readers.
struct ReaderConfig {
  // If false (default), invalid data is replaced with U+FFFD.
  // If true, reading fails at the first invalid character.
  bool strict{false};
  // Input format. DETECT (default) guesses; any other value forces the
  // format.
  ReaderInputFormat input{ReaderInputFormat::DETECT};
  // If true (default), any BOM found at start of stream will be skipped.
  // If false, the BOM is included in the output.
  bool skip_bom{true};
};

}  // namespace u

namespace u8 {

// Reader producing UTF-8 text in a std::string.
class Reader : public io::Reader {
 public:
  // Keep the base-class overloads visible next to the string overloads
  // declared below (they would otherwise be hidden by name lookup).
  using io::Reader::read;
  using io::Reader::repeat_read;

  // Reads into `data`.
  // NOTE(review): presumably `max` bounds how much is read and the returned
  // size_t is the amount actually read; whether `data` is appended to or
  // overwritten is not visible from this header - confirm against the
  // implementation.
  [[nodiscard]] std::expected<size_t, io::ReadError> read(
      std::string& data, size_t max);

  // String overload matching io::Reader::repeat_read.
  // NOTE(review): presumably keeps reading until `max` is reached or the
  // stream ends - confirm against io.hh.
  [[nodiscard]] std::expected<size_t, io::ReadError> repeat_read(
      std::string& data, size_t max);
};

// Creates a UTF-8 producing reader from an existing io::Reader, which is
// passed in by value as a std::unique_ptr.
[[nodiscard]] std::unique_ptr<Reader> open(
    std::unique_ptr<io::Reader> reader, u::ReaderConfig config = {});

// Creates a UTF-8 producing reader for `file_path`; returns io::OpenError on
// failure.
[[nodiscard]] std::expected<std::unique_ptr<Reader>, io::OpenError> open(
    const std::string& file_path, u::ReaderConfig config = {});

// Same as open(file_path, config) but additionally takes `dirfd`.
// NOTE(review): presumably `file_path` is resolved relative to `dirfd` as
// with openat(2) - confirm against io.hh.
[[nodiscard]] std::expected<std::unique_ptr<Reader>, io::OpenError> openat(
    int dirfd, const std::string& file_path, u::ReaderConfig config = {});

}  // namespace u8

namespace u16 {

// Reader producing UTF-16 text in a std::u16string.
class Reader : public io::Reader {
 public:
  // Keep the base-class overloads visible next to the u16string overloads
  // declared below (they would otherwise be hidden by name lookup).
  using io::Reader::read;
  using io::Reader::repeat_read;

  // Reads into `data`.
  // NOTE(review): presumably `max` bounds how much is read and the returned
  // size_t is the amount actually read; the unit (bytes vs. code units) is
  // not visible from this header - confirm against the implementation.
  [[nodiscard]] std::expected<size_t, io::ReadError> read(
      std::u16string& data, size_t max);

  // u16string overload matching io::Reader::repeat_read.
  // NOTE(review): presumably keeps reading until `max` is reached or the
  // stream ends - confirm against io.hh.
  [[nodiscard]] std::expected<size_t, io::ReadError> repeat_read(
      std::u16string& data, size_t max);
};

// Creates a UTF-16 producing reader from an existing io::Reader, which is
// passed in by value as a std::unique_ptr.
[[nodiscard]] std::unique_ptr<Reader> open(
    std::unique_ptr<io::Reader> reader, u::ReaderConfig config = {});

// Creates a UTF-16 producing reader for `file_path`; returns io::OpenError on
// failure.
[[nodiscard]] std::expected<std::unique_ptr<Reader>, io::OpenError> open(
    const std::string& file_path, u::ReaderConfig config = {});

// Same as open(file_path, config) but additionally takes `dirfd`.
// NOTE(review): presumably `file_path` is resolved relative to `dirfd` as
// with openat(2) - confirm against io.hh.
[[nodiscard]] std::expected<std::unique_ptr<Reader>, io::OpenError> openat(
    int dirfd, const std::string& file_path, u::ReaderConfig config = {});

}  // namespace u16

#endif  // UIO_HH
