summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Joel Klinghed <the_jk@spawned.biz> 2025-09-04 22:24:13 +0200
committer Joel Klinghed <the_jk@spawned.biz> 2025-09-04 22:24:13 +0200
commit 65860e6c873e6e056fe3d1dadd1d309b1bd66e7b (patch)
tree cb59ed23c72b841fc2688606d68359b3f6b1e324 /src
parent d75b25d50f4df655d1e69ff900cfeee823039296 (diff)
Add UTF-8, UTF-16 and Modified UTF-8 support
Diffstat (limited to 'src')
-rw-r--r-- src/u.hh 19
-rw-r--r-- src/u16.hh 86
-rw-r--r-- src/u8.hh 180
-rw-r--r-- src/umod8.hh 186
4 files changed, 471 insertions, 0 deletions
diff --git a/src/u.hh b/src/u.hh
new file mode 100644
index 0000000..583b67b
--- /dev/null
+++ b/src/u.hh
@@ -0,0 +1,19 @@
+#ifndef U_HH
+#define U_HH
+
+namespace u {
+
+// Failure reasons reported by the read() functions in the u16, u8 and umod8
+// namespaces.
+enum class ReadError {
+ Invalid, // The input at the iterator is not a well-formed sequence
+ End, // Nothing left to read (it == end)
+ Incomplete, // The sequence needs more units than remain before end
+};
+
+// Failure reasons reported by the read_replace() functions. There is no
+// Invalid member: read_replace() returns 0xfffd for invalid input instead of
+// reporting an error.
+enum class ReadErrorReplace {
+ End, // Nothing left to read (it == end)
+ Incomplete, // The sequence needs more units than remain before end
+};
+
+} // namespace u
+
+#endif // U_HH
diff --git a/src/u16.hh b/src/u16.hh
new file mode 100644
index 0000000..6894a84
--- /dev/null
+++ b/src/u16.hh
@@ -0,0 +1,86 @@
+#ifndef U16_HH
+#define U16_HH
+
+#include <cstdint>
+#include <expected>
+#include <iterator>
+#include <type_traits>
+
+#include "u.hh"
+
+namespace u16 {
+
+// Decodes one code point from the UTF-16 code units in [start, end).
+//
+// On success `start` ends up just past the one or two units consumed.
+// Errors:
+//  - End: start == end, `start` is unchanged.
+//  - Incomplete: a high surrogate is the last unit, `start` is unchanged.
+//  - Invalid: a lone low surrogate (consumed), or a high surrogate that is
+//    not followed by a low surrogate (only the high surrogate is consumed,
+//    so the following unit is decoded by the next call).
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint16_t>
+std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
+  if (start == end) return std::unexpected(u::ReadError::End);
+
+  uint16_t const first = *start;
+  bool const leads_pair = first >= 0xd800 && first <= 0xdbff;
+
+  if (!leads_pair) {
+    // Single unit: either a BMP code point or a stray low surrogate.
+    ++start;
+    if (first >= 0xdc00 && first <= 0xdfff)
+      return std::unexpected(u::ReadError::Invalid);
+    return first;
+  }
+
+  if (std::distance(start, end) < 2)
+    return std::unexpected(u::ReadError::Incomplete);
+
+  ++start;
+  uint16_t const second = *start;
+  if (second < 0xdc00 || second > 0xdfff)
+    return std::unexpected(u::ReadError::Invalid);
+
+  ++start;
+  return 0x10000 + (((first - 0xd800) << 10) | (second - 0xdc00));
+}
+
+// Same as read(), except that invalid input yields the replacement character
+// 0xfffd instead of an error. `start` is left wherever read() left it.
+// End and Incomplete are passed through as u::ReadErrorReplace values.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint16_t>
+std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
+                                                          const T& end) {
+  auto const decoded = read(start, end);
+  if (decoded) return decoded.value();
+
+  auto const reason = decoded.error();
+  if (reason == u::ReadError::Invalid) return 0xfffd;
+  if (reason == u::ReadError::Incomplete)
+    return std::unexpected(u::ReadErrorReplace::Incomplete);
+  return std::unexpected(u::ReadErrorReplace::End);
+}
+
+// Encodes `code` as UTF-16 at `start` and advances `start` past the written
+// code units: one unit below 0x10000, a surrogate pair otherwise.
+//
+// Returns false, writing nothing and leaving `start` untouched, when
+// [start, end) is too small or when `code` is above 0x10ffff and therefore
+// has no surrogate pair representation. Values in 0xd800..0xdfff are not
+// checked and are written unchanged as a single unit.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint16_t>
+bool write(T& start, const T& end, uint32_t code) {
+  // Above U+10FFFF the high half would leave the 0xd800..0xdbff range and
+  // the output would silently encode something else.
+  if (code > 0x10ffff) return false;
+  if (code < 0x10000) {
+    if (start == end) return false;
+    *start = static_cast<uint16_t>(code);
+  } else {
+    if (std::distance(start, end) < 2) return false;
+    code -= 0x10000;
+    *start = static_cast<uint16_t>(0xd800 + (code >> 10));
+    std::advance(start, 1);
+    *start = static_cast<uint16_t>(0xdc00 + (code & 0x3ff));
+  }
+  std::advance(start, 1);
+  return true;
+}
+
+// Advances `start` past one UTF-16 encoded code point without decoding it.
+//
+// A high surrogate is assumed to start a pair and two units are skipped; the
+// second unit is not inspected. Returns false with `start` unchanged when
+// start == end or when a high surrogate is the last unit.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint16_t>
+bool skip(T& start, const T& end) {
+  if (start == end) return false;
+
+  uint16_t const unit = *start;
+  bool const leads_pair = unit >= 0xd800 && unit <= 0xdbff;
+  if (leads_pair && std::distance(start, end) < 2) return false;
+
+  std::advance(start, leads_pair ? 2 : 1);
+  return true;
+}
+
+} // namespace u16
+
+#endif // U16_HH
diff --git a/src/u8.hh b/src/u8.hh
new file mode 100644
index 0000000..413b156
--- /dev/null
+++ b/src/u8.hh
@@ -0,0 +1,180 @@
+#ifndef U8_HH
+#define U8_HH
+
+#include <cstdint>
+#include <expected>
+#include <iterator>
+#include <type_traits>
+
+#include "u.hh"
+
+namespace u8 {
+
+// Decodes one UTF-8 encoded code point from the bytes in [start, end).
+//
+// On success `start` ends up just past the sequence. Errors:
+//  - End: start == end, `start` is unchanged.
+//  - Incomplete: the lead byte announces more bytes than remain, `start` is
+//    unchanged.
+//  - Invalid: `start` is advanced by one byte for a stray continuation byte
+//    or an impossible lead byte (0xf8..0xff). For a bad continuation byte,
+//    an overlong form, an encoded surrogate or a value above 0x10ffff it is
+//    advanced past the whole length announced by the lead byte.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
+  if (start == end) return std::unexpected(u::ReadError::End);
+  uint32_t u;
+  switch (*start >> 4) {
+    case 0xf:
+      // 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
+      if ((*start & 0x08) != 0) {
+        // 11111xxx never starts a sequence. Without this check bit 3 would
+        // be masked away below and e.g. f8 90 80 80 would decode as U+10000.
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      if (std::distance(start, end) < 4) {
+        return std::unexpected(u::ReadError::Incomplete);
+      }
+      u = (*start & 0x07) << 18;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 3);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= (*start & 0x3f) << 12;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 2);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= (*start & 0x3f) << 6;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= *start & 0x3f;
+      // Reject overlong forms and values outside the Unicode range.
+      if (u < 0x10000 || u > 0x10ffff) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      break;
+    case 0xe:
+      // 1110wwww 10xxxxyy 10yyzzzz
+      if (std::distance(start, end) < 3) {
+        return std::unexpected(u::ReadError::Incomplete);
+      }
+      u = (*start & 0x0f) << 12;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 2);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= (*start & 0x3f) << 6;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= *start & 0x3f;
+      // Reject overlong forms and encoded surrogates.
+      if (u < 0x800 || (u >= 0xd800 && u <= 0xdfff)) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      break;
+    case 0xd:
+    case 0xc:
+      // 110xxxyy 10yyzzzz
+      if (std::distance(start, end) < 2) {
+        return std::unexpected(u::ReadError::Incomplete);
+      }
+      u = (*start & 0x1f) << 6;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= *start & 0x3f;
+      // Reject overlong forms (lead bytes 0xc0 and 0xc1).
+      if (u < 0x80) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      break;
+    case 0xb:
+    case 0xa:
+    case 0x9:
+    case 0x8:
+      // 10xxxxxx: continuation byte without a lead byte.
+      std::advance(start, 1);
+      return std::unexpected(u::ReadError::Invalid);
+    default:
+      // 0yyyzzzz
+      u = *start;
+      break;
+  }
+  // Every successful case leaves `start` on the last byte of the sequence.
+  std::advance(start, 1);
+  return u;
+}
+
+// Same as read(), except that invalid input yields the replacement character
+// 0xfffd instead of an error. `start` is left wherever read() left it.
+// End and Incomplete are passed through as u::ReadErrorReplace values.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
+                                                          const T& end) {
+  auto const decoded = read(start, end);
+  if (decoded) return decoded.value();
+
+  auto const reason = decoded.error();
+  if (reason == u::ReadError::Invalid) return 0xfffd;
+  if (reason == u::ReadError::Incomplete)
+    return std::unexpected(u::ReadErrorReplace::Incomplete);
+  return std::unexpected(u::ReadErrorReplace::End);
+}
+
+// Encodes `code` as UTF-8 at `start` and advances `start` past the one to
+// four bytes written.
+//
+// Returns false, writing nothing and leaving `start` untouched, when
+// [start, end) is too small or when `code` is above 0x10ffff. Values in
+// 0xd800..0xdfff are not checked and are encoded as ordinary three byte
+// sequences.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+bool write(T& start, const T& end, uint32_t code) {
+  // Above U+10FFFF the lead byte computation below would either produce an
+  // invalid lead byte or truncate and silently encode a different value.
+  if (code > 0x10ffff) return false;
+  if (code < 0x80) {
+    if (start == end) return false;
+    *start = static_cast<uint8_t>(code);
+  } else if (code < 0x800) {
+    if (std::distance(start, end) < 2) return false;
+    *start = 0xc0 | static_cast<uint8_t>(code >> 6);
+    std::advance(start, 1);
+    *start = 0x80 | static_cast<uint8_t>(code & 0x3f);
+  } else if (code < 0x10000) {
+    if (std::distance(start, end) < 3) return false;
+    *start = 0xe0 | static_cast<uint8_t>(code >> 12);
+    std::advance(start, 1);
+    *start = 0x80 | static_cast<uint8_t>((code >> 6) & 0x3f);
+    std::advance(start, 1);
+    *start = 0x80 | static_cast<uint8_t>(code & 0x3f);
+  } else {
+    if (std::distance(start, end) < 4) return false;
+    *start = 0xf0 | static_cast<uint8_t>(code >> 18);
+    std::advance(start, 1);
+    *start = 0x80 | static_cast<uint8_t>((code >> 12) & 0x3f);
+    std::advance(start, 1);
+    *start = 0x80 | static_cast<uint8_t>((code >> 6) & 0x3f);
+    std::advance(start, 1);
+    *start = 0x80 | static_cast<uint8_t>(code & 0x3f);
+  }
+  // Every branch leaves `start` on the last byte it wrote.
+  std::advance(start, 1);
+  return true;
+}
+
+// Advances `start` past one UTF-8 sequence without decoding it.
+//
+// The length is taken from the first byte alone (4 for 0xf0 and above, 3 for
+// 0xe0..0xef, 2 for 0xc0..0xdf, otherwise 1); the remaining bytes are not
+// inspected. Returns false with `start` unchanged when start == end or when
+// fewer bytes remain than the first byte announces.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+bool skip(T& start, const T& end) {
+  if (start == end) return false;
+
+  uint8_t const lead = *start;
+  int length = 1;
+  if (lead >= 0xf0) {
+    length = 4;
+  } else if (lead >= 0xe0) {
+    length = 3;
+  } else if (lead >= 0xc0) {
+    length = 2;
+  }
+
+  // A single byte is always available here since start != end.
+  if (length > 1 && std::distance(start, end) < length) return false;
+
+  std::advance(start, length);
+  return true;
+}
+
+} // namespace u8
+
+#endif // U8_HH
diff --git a/src/umod8.hh b/src/umod8.hh
new file mode 100644
index 0000000..8d4fdb2
--- /dev/null
+++ b/src/umod8.hh
@@ -0,0 +1,186 @@
+#ifndef UMOD8_HH
+#define UMOD8_HH
+
+#include <cstdint>
+#include <expected>
+#include <iterator>
+#include <type_traits>
+
+#include "u.hh"
+
+namespace umod8 {
+
+// Decodes one code point from the Modified UTF-8 bytes in [start, end).
+//
+// Differences from the plain UTF-8 reader visible in this function:
+//  - the two byte form c0 80 is accepted and yields 0,
+//  - four byte sequences (lead 0xf0..0xff) are always Invalid,
+//  - a three byte encoded high surrogate immediately followed by a three
+//    byte encoded low surrogate is combined into one code point >= 0x10000.
+//
+// On success `start` ends up just past the bytes consumed. Errors:
+//  - End: start == end, `start` is unchanged.
+//  - Incomplete: more bytes are announced than remain; `start` is unchanged
+//    (restored to the first byte when the second half of a pair is cut off).
+//  - Invalid: `start` has been advanced. For a high surrogate without a
+//    valid low surrogate after it, `start` is left just past the high
+//    surrogate's three bytes so the following bytes are decoded by the next
+//    call.
+template<std::forward_iterator T>
+ requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
+ if (start == end) return std::unexpected(u::ReadError::End);
+ uint32_t u;
+ switch (*start >> 4) {
+ case 0xe: {
+ // Remember the first byte so the iterator can be rewound below.
+ auto const tmp = start;
+ // 1110wwww 10xxxxyy 10yyzzzz
+ if (std::distance(start, end) < 3) {
+ return std::unexpected(u::ReadError::Incomplete);
+ }
+ u = (*start & 0x0f) << 12;
+ std::advance(start, 1);
+ if ((*start & 0xc0) != 0x80) {
+ std::advance(start, 2);
+ return std::unexpected(u::ReadError::Invalid);
+ }
+ u |= (*start & 0x3f) << 6;
+ std::advance(start, 1);
+ if ((*start & 0xc0) != 0x80) {
+ std::advance(start, 1);
+ return std::unexpected(u::ReadError::Invalid);
+ }
+ u |= *start & 0x3f;
+ // Overlong three byte form.
+ if (u < 0x800) {
+ std::advance(start, 1);
+ return std::unexpected(u::ReadError::Invalid);
+ }
+ // High surrogate: look ahead for the matching low surrogate.
+ if (u >= 0xd800 && u <= 0xdbff) {
+ std::advance(start, 1);
+ // Not going recursive here as we don't want it unbounded
+ // Lone high surrogate at end == invalid.
+ if (start == end) return std::unexpected(u::ReadError::Invalid);
+ if ((*start >> 4) == 0xe) {
+ if (std::distance(start, end) < 3) {
+ // The second half may still arrive; rewind to the first byte.
+ start = tmp;
+ return std::unexpected(u::ReadError::Incomplete);
+ }
+ uint32_t v = (*start & 0x0f) << 12;
+ std::advance(start, 1);
+ if ((*start & 0xc0) == 0x80) {
+ v |= (*start & 0x3f) << 6;
+ std::advance(start, 1);
+ if ((*start & 0xc0) == 0x80) {
+ v |= *start & 0x3f;
+ if (v >= 0xdc00 && v <= 0xdfff) {
+ std::advance(start, 1);
+ return 0x10000 + (((u - 0xd800) << 10) | (v - 0xdc00));
+ }
+ }
+ }
+ // Not a low surrogate: leave start just past the high surrogate.
+ start = std::next(tmp, 3);
+ }
+ // Next character may be valid, invalid, something, but we know
+ // it is not the second half of a surrogate pair, so consider
+ // this first part invalid.
+ return std::unexpected(u::ReadError::Invalid);
+ }
+ // Low surrogate without a preceding high surrogate.
+ if (u >= 0xdc00 && u <= 0xdfff) {
+ std::advance(start, 1);
+ return std::unexpected(u::ReadError::Invalid);
+ }
+ break;
+ }
+ case 0xd:
+ case 0xc:
+ // 110xxxyy 10yyzzzz
+ if (std::distance(start, end) < 2) {
+ return std::unexpected(u::ReadError::Incomplete);
+ }
+ u = (*start & 0x1f) << 6;
+ std::advance(start, 1);
+ if ((*start & 0xc0) != 0x80) {
+ std::advance(start, 1);
+ return std::unexpected(u::ReadError::Invalid);
+ }
+ u |= *start & 0x3f;
+ // Overlong forms are rejected, except c0 80 which encodes 0.
+ if (u > 0 && u < 0x80) {
+ std::advance(start, 1);
+ return std::unexpected(u::ReadError::Invalid);
+ }
+ break;
+ case 0xf:
+ case 0xb:
+ case 0xa:
+ case 0x9:
+ case 0x8:
+ // Four byte lead bytes and stray continuation bytes: consume one byte.
+ std::advance(start, 1);
+ return std::unexpected(u::ReadError::Invalid);
+ default:
+ // 0yyyzzzz
+ u = *start;
+ break;
+ }
+ // Every successful case that reaches here leaves start on the last byte.
+ std::advance(start, 1);
+ return u;
+}
+
+// Same as read(), except that invalid input yields the replacement character
+// 0xfffd instead of an error. `start` is left wherever read() left it.
+// End and Incomplete are passed through as u::ReadErrorReplace values.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
+                                                          const T& end) {
+  auto const decoded = read(start, end);
+  if (decoded) return decoded.value();
+
+  auto const reason = decoded.error();
+  if (reason == u::ReadError::Invalid) return 0xfffd;
+  if (reason == u::ReadError::Incomplete)
+    return std::unexpected(u::ReadErrorReplace::Incomplete);
+  return std::unexpected(u::ReadErrorReplace::End);
+}
+
+// Encodes `code` as Modified UTF-8 at `start` and advances `start` past the
+// bytes written.
+//
+// 0 is written as the two bytes c0 80, other values below 0x10000 use the
+// regular one to three byte forms, and values from 0x10000 are written as a
+// surrogate pair with each half encoded in three bytes (six bytes total).
+//
+// Returns false with `start` unchanged when [start, end) is too small or
+// when `code` is above 0x10ffff. If only the first half of a surrogate pair
+// fits, its three bytes have already been stored even though `start` is
+// reset.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+bool write(T& start, const T& end, uint32_t code) {
+  // Above U+10FFFF the "high surrogate" computed below would fall outside
+  // 0xd800..0xdbff and the output would not decode back to `code`.
+  if (code > 0x10ffff) return false;
+  if (code > 0 && code < 0x80) {
+    if (start == end) return false;
+    *start = static_cast<uint8_t>(code);
+  } else if (code < 0x800) {
+    // Also reached for code == 0, which becomes c0 80.
+    if (std::distance(start, end) < 2) return false;
+    *start = 0xc0 | static_cast<uint8_t>(code >> 6);
+    std::advance(start, 1);
+    *start = 0x80 | static_cast<uint8_t>(code & 0x3f);
+  } else if (code < 0x10000) {
+    if (std::distance(start, end) < 3) return false;
+    *start = 0xe0 | static_cast<uint8_t>(code >> 12);
+    std::advance(start, 1);
+    *start = 0x80 | static_cast<uint8_t>((code >> 6) & 0x3f);
+    std::advance(start, 1);
+    *start = 0x80 | static_cast<uint8_t>(code & 0x3f);
+  } else {
+    // Split into a surrogate pair; each half takes the three byte branch.
+    auto tmp = start;
+    code -= 0x10000;
+    if (write(start, end, 0xd800 + (code >> 10)) &&
+        write(start, end, 0xdc00 + (code & 0x3ff))) {
+      return true;
+    }
+    start = tmp;
+    return false;
+  }
+  // The non-recursive branches leave `start` on the last byte they wrote.
+  std::advance(start, 1);
+  return true;
+}
+
+// Advances `start` past one Modified UTF-8 encoded code point.
+//
+// Three byte leads (0xe0..0xef) are handed to read() so that an encoded
+// surrogate pair is skipped as a unit; if read() fails for any reason
+// `start` is restored and false is returned. Two byte leads skip two bytes
+// without inspecting the second one, and every other byte skips one byte.
+// Also returns false with `start` unchanged when start == end or when a two
+// byte lead is the last byte.
+template<std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+bool skip(T& start, const T& end) {
+  if (start == end) return false;
+
+  auto const high_nibble = *start >> 4;
+
+  if (high_nibble == 0xe) {
+    auto const saved = start;
+    if (read(start, end)) return true;
+    start = saved;
+    return false;
+  }
+
+  if (high_nibble == 0xc || high_nibble == 0xd) {
+    if (std::distance(start, end) < 2) return false;
+    std::advance(start, 2);
+    return true;
+  }
+
+  std::advance(start, 1);
+  return true;
+}
+
+} // namespace umod8
+
+#endif // UMOD8_HH