summaryrefslogtreecommitdiff
path: root/src/u8.hh
diff options
context:
space:
mode:
authorJoel Klinghed <the_jk@spawned.biz>2025-09-15 20:52:51 +0200
committerJoel Klinghed <the_jk@spawned.biz>2025-09-15 20:52:51 +0200
commit18a622f378b403788c67fc785d30f4609caa3fc7 (patch)
tree9d13f4ef49a06c9e4837487f61bc90b734ad9b9a /src/u8.hh
parent28c6425e4ed1cd2eab538e7cba08c18aa83d8af5 (diff)
uio: Unicode reader
Reads UTF-8 and UTF-16 into UTF-8 or UTF-16 strings. If strict is true, reading fails at the first invalid character. If strict is false, invalid characters are replaced with U+FFFD. For the replacement, I changed the behavior of uN::read_replace to only jump one byte. Otherwise, a common invalid case, where ISO-8859-1 or WIN-1252 is read as UTF-8, would skip many characters. If skip_bom is true, any BOM at the start of the stream is ignored. If skip_bom is false, any BOM will be included. The input format can be forced; if it is not, detect is used, which will try to guess and then fall back to UTF-8.
Diffstat (limited to 'src/u8.hh')
-rw-r--r--src/u8.hh11
1 files changed, 8 insertions, 3 deletions
diff --git a/src/u8.hh b/src/u8.hh
index 3c1d19e..b89f80f 100644
--- a/src/u8.hh
+++ b/src/u8.hh
@@ -105,19 +105,24 @@ std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
template<std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint8_t>
std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
- const T& end) {
+ const T& end,
+ bool eof) {
+ auto const tmp = start;
auto ret = read(start, end);
if (ret.has_value())
return *ret;
switch (ret.error()) {
case u::ReadError::Incomplete:
+ if (eof)
+ break;
return std::unexpected(u::ReadErrorReplace::Incomplete);
case u::ReadError::End:
return std::unexpected(u::ReadErrorReplace::End);
case u::ReadError::Invalid:
- return 0xfffd;
+ break;
}
- std::unreachable();
+ start = tmp + 1;
+ return 0xfffd;
}
template<std::forward_iterator T>