uio: Unicode reader

Reads UTF-8 and UTF-16 into UTF-8 or UTF-16 strings. If strict is true, reading fails at the first invalid character. If strict is false, invalid characters are replaced with U+FFFD. For the replacement, I changed the behavior of uN::read_replace to only jump one byte. Otherwise, in the common invalid case where ISO-8859-1 or WIN-1252 is read as UTF-8, many characters would be skipped. If skip_bom is true, any BOM at the start of the stream is ignored. If skip_bom is false, any BOM will be included. The input format can be forced; if it is not, detect is used, which will try to guess and then fall back to UTF-8.
author: Joel Klinghed <the_jk@spawned.biz> 2025-09-15 20:52:51 +0200
committer: Joel Klinghed <the_jk@spawned.biz> 2025-09-15 20:52:51 +0200
commit: 18a622f378b403788c67fc785d30f4609caa3fc7 (patch)
tree: 9d13f4ef49a06c9e4837487f61bc90b734ad9b9a /src/u8.hh
parent: 28c6425e4ed1cd2eab538e7cba08c18aa83d8af5 (diff)
1 files changed, 8 insertions, 3 deletions
diff --git a/src/u8.hh b/src/u8.hh
index 3c1d19e..b89f80f 100644
--- a/src/u8.hh
+++ b/src/u8.hh
@@ -105,19 +105,24 @@ std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
 template<std::forward_iterator T>
   requires std::is_same_v<std::iter_value_t<T>, uint8_t>
 std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
-                                                          const T& end) {
+                                                          const T& end,
+                                                          bool eof) {
+  auto const tmp = start;
   auto ret = read(start, end);
   if (ret.has_value())
     return *ret;
   switch (ret.error()) {
     case u::ReadError::Incomplete:
+      if (eof)
+        break;
       return std::unexpected(u::ReadErrorReplace::Incomplete);
     case u::ReadError::End:
       return std::unexpected(u::ReadErrorReplace::End);
     case u::ReadError::Invalid:
-      return 0xfffd;
+      break;
   }
-  std::unreachable();
+  start = tmp + 1;
+  return 0xfffd;
 }
 
 template<std::forward_iterator T>
author	Joel Klinghed <the_jk@spawned.biz>	2025-09-15 20:52:51 +0200
committer	Joel Klinghed <the_jk@spawned.biz>	2025-09-15 20:52:51 +0200
commit	18a622f378b403788c67fc785d30f4609caa3fc7 (patch)
tree	9d13f4ef49a06c9e4837487f61bc90b734ad9b9a /src/u8.hh
parent	28c6425e4ed1cd2eab538e7cba08c18aa83d8af5 (diff)