From 18a622f378b403788c67fc785d30f4609caa3fc7 Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Mon, 15 Sep 2025 20:52:51 +0200 Subject: uio: Unicode reader Reads UTF-8 and UTF-16 into UTF-8 or UTF-16 strings. If strict is true, fails at first invalid character. If strict is false, invalid characters are replaced with U+FFFD. For the replacement, I changed the behavior of uN::read_replace to only jump one byte. Otherwise a common invalid case when ISO-8859-1 or WIN-1252 are read as UTF-8 would skip many characters. If skip_bom is true any BOM at start of stream is ignored. If skip_bom is false any BOM will be included. Input format can be forced; if not, detect is used, which will try to guess and then fall back to UTF-8. --- src/umod8.hh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'src/umod8.hh') diff --git a/src/umod8.hh b/src/umod8.hh index 117591f..b91b199 100644 --- a/src/umod8.hh +++ b/src/umod8.hh @@ -113,19 +113,24 @@ std::expected read(T& start, const T& end) { template requires std::is_same_v, uint8_t> std::expected read_replace(T& start, - const T& end) { + const T& end, + bool eof) { + auto const tmp = start; auto ret = read(start, end); if (ret.has_value()) return *ret; switch (ret.error()) { case u::ReadError::Incomplete: + if (eof) + break; return std::unexpected(u::ReadErrorReplace::Incomplete); case u::ReadError::End: return std::unexpected(u::ReadErrorReplace::End); case u::ReadError::Invalid: - return 0xfffd; + break; } - std::unreachable(); + start = tmp + 1; + return 0xfffd; } template -- cgit v1.3