summary refs log tree commit diff
path: root/src/u8.hh
diff options
context:
space:
mode:
Diffstat (limited to 'src/u8.hh')
-rw-r--r-- src/u8.hh 196
1 files changed, 196 insertions, 0 deletions
diff --git a/src/u8.hh b/src/u8.hh
new file mode 100644
index 0000000..d673caa
--- /dev/null
+++ b/src/u8.hh
@@ -0,0 +1,196 @@
+#ifndef U8_HH
+#define U8_HH
+
+#include "u.hh" // IWYU pragma: export
+
+#include <cstdint> // IWYU pragma: export
+#include <expected>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+namespace u8 {
+
+// Decodes a single code point from the UTF-8 bytes in [start, end).
+//
+// On success the decoded code point is returned and start is moved past the
+// bytes that encoded it. On failure one of these errors is returned:
+//  - u::ReadError::End: the range is empty.
+//  - u::ReadError::Incomplete: the lead byte announces a longer sequence
+//    than the range holds. start is not moved.
+//  - u::ReadError::Invalid: either a byte that can never start a sequence
+//    (a stray continuation byte 0x80..0xbf, or 0xf8..0xff), in which case
+//    start moves one byte; or a malformed sequence (bad continuation byte,
+//    overlong encoding, surrogate, or a four byte value outside
+//    0x10000..0x10ffff), in which case start moves past the whole length
+//    announced by the lead byte.
+//
+// Note: the length checks use std::distance, which is linear in the
+// remaining range for iterators that are not random access.
+template <std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+std::expected<uint32_t, u::ReadError> read(T& start, T const& end) {
+  if (start == end)
+    return std::unexpected(u::ReadError::End);
+  uint32_t u = 0;
+  // The high nibble of the lead byte selects the sequence length.
+  switch (*start >> 4) {
+    case 0xf:
+      // 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
+      if ((*start & 0x08) != 0) {
+        // 11111xxx (0xf8..0xff) is never a valid lead byte. Without this
+        // check the mask below would drop bit 3 and decode the byte as if
+        // it were 0xf0..0xf7. Treat it like any other byte that cannot
+        // start a sequence.
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      if (std::distance(start, end) < 4) {
+        return std::unexpected(u::ReadError::Incomplete);
+      }
+      u = (*start & 0x07) << 18;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        // Skip the rest of the announced sequence.
+        std::advance(start, 3);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= (*start & 0x3f) << 12;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 2);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= (*start & 0x3f) << 6;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= *start & 0x3f;
+      // Reject overlong encodings and values above the last code point.
+      if (u < 0x10000 || u > 0x10ffff) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      break;
+    case 0xe:
+      // 1110wwww 10xxxxyy 10yyzzzz
+      if (std::distance(start, end) < 3) {
+        return std::unexpected(u::ReadError::Incomplete);
+      }
+      u = (*start & 0x0f) << 12;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 2);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= (*start & 0x3f) << 6;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= *start & 0x3f;
+      // Reject overlong encodings and UTF-16 surrogates.
+      if (u < 0x800 || (u >= 0xd800 && u <= 0xdfff)) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      break;
+    case 0xd:
+    case 0xc:
+      // 110xxxyy 10yyzzzz
+      if (std::distance(start, end) < 2) {
+        return std::unexpected(u::ReadError::Incomplete);
+      }
+      u = (*start & 0x1f) << 6;
+      std::advance(start, 1);
+      if ((*start & 0xc0) != 0x80) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      u |= *start & 0x3f;
+      // Reject overlong encodings (lead bytes 0xc0 and 0xc1).
+      if (u < 0x80) {
+        std::advance(start, 1);
+        return std::unexpected(u::ReadError::Invalid);
+      }
+      break;
+    case 0xb:
+    case 0xa:
+    case 0x9:
+    case 0x8:
+      // 10xxxxxx: a continuation byte where a lead byte was expected.
+      std::advance(start, 1);
+      return std::unexpected(u::ReadError::Invalid);
+    default:
+      // 0yyyzzzz
+      u = *start;
+      break;
+  }
+  // Step over the last byte of the sequence.
+  std::advance(start, 1);
+  return u;
+}
+
+// Like read(), but substitutes U+FFFD for input that cannot be decoded.
+//
+// Returns:
+//  - the decoded code point when read() succeeds (start is moved by read());
+//  - 0xfffd when read() reports Invalid, or reports Incomplete while eof is
+//    true; start is then placed exactly one byte past its value on entry,
+//    so the following bytes are examined again on the next call;
+//  - u::ReadErrorReplace::Incomplete when read() reports Incomplete and eof
+//    is false;
+//  - u::ReadErrorReplace::End when read() reports End.
+template <std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
+                                                          T const& end,
+                                                          bool eof) {
+  // Remember where we began; read() may move start on Invalid.
+  auto const tmp = start;
+  auto ret = read(start, end);
+  if (ret.has_value())
+    return *ret;
+  switch (ret.error()) {
+    case u::ReadError::Incomplete:
+      // At end of input no more bytes will arrive, so replace instead.
+      if (eof)
+        break;
+      return std::unexpected(u::ReadErrorReplace::Incomplete);
+    case u::ReadError::End:
+      return std::unexpected(u::ReadErrorReplace::End);
+    case u::ReadError::Invalid:
+      break;
+  }
+  // std::next rather than tmp + 1: the template only requires a forward
+  // iterator, and operator+ is not available on those.
+  start = std::next(tmp);
+  return 0xfffd;
+}
+
+// Encodes code as UTF-8 into [start, end).
+//
+// Returns false, leaving start and the buffer untouched, when the range is
+// too small for the one to four bytes the encoding needs. Otherwise the
+// bytes are stored, start is moved past them and true is returned.
+//
+// code is not validated: surrogates and values above 0x10ffff are encoded
+// with the same bit layout as any other value of that size.
+template <std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+bool write(T& start, T const& end, uint32_t code) {
+  // Steps to the next slot and stores a 10xxxxxx byte carrying the six bits
+  // of code that begin at bit position shift.
+  auto const put_tail = [&start, &code](unsigned shift) {
+    ++start;
+    *start = 0x80 | static_cast<uint8_t>((code >> shift) & 0x3f);
+  };
+
+  if (code < 0x80) {
+    // 0yyyzzzz
+    if (start == end)
+      return false;
+    *start = static_cast<uint8_t>(code);
+    ++start;
+    return true;
+  }
+
+  if (code < 0x800) {
+    // 110xxxyy 10yyzzzz
+    if (std::distance(start, end) < 2)
+      return false;
+    *start = 0xc0 | static_cast<uint8_t>(code >> 6);
+    put_tail(0);
+    ++start;
+    return true;
+  }
+
+  if (code < 0x10000) {
+    // 1110wwww 10xxxxyy 10yyzzzz
+    if (std::distance(start, end) < 3)
+      return false;
+    *start = 0xe0 | static_cast<uint8_t>(code >> 12);
+    put_tail(6);
+    put_tail(0);
+    ++start;
+    return true;
+  }
+
+  // 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
+  if (std::distance(start, end) < 4)
+    return false;
+  *start = 0xf0 | static_cast<uint8_t>(code >> 18);
+  put_tail(12);
+  put_tail(6);
+  put_tail(0);
+  ++start;
+  return true;
+}
+
+// Moves start over one UTF-8 sequence without decoding or validating it.
+//
+// The length is taken from the high nibble of the first byte: 0xf means
+// four bytes, 0xe three, 0xc or 0xd two, anything else one. Returns false,
+// leaving start untouched, when the range is empty or shorter than that
+// length; returns true after advancing otherwise.
+template <std::forward_iterator T>
+  requires std::is_same_v<std::iter_value_t<T>, uint8_t>
+bool skip(T& start, T const& end) {
+  if (start == end)
+    return false;
+
+  int length = 1;
+  switch (*start >> 4) {
+    case 0xf:
+      length = 4;
+      break;
+    case 0xe:
+      length = 3;
+      break;
+    case 0xd:
+    case 0xc:
+      length = 2;
+      break;
+    default:
+      break;
+  }
+
+  // A single byte is known to be available, so only the multi byte forms
+  // need to measure the remaining range.
+  if (length > 1 && std::distance(start, end) < length)
+    return false;
+
+  std::advance(start, length);
+  return true;
+}
+
+} // namespace u8
+
+#endif // U8_HH