summary refs log tree commit diff
path: root/utf/src/utf8.cc
diff options
context:
space:
mode:
author Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
committer Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
commit fc4547b412e28164af1bf8981234c6af959ccc0b (patch)
tree 061253e7a4f6abaca282223b36d10f0bed8cad23 /utf/src/utf8.cc
WIP
Diffstat (limited to 'utf/src/utf8.cc')
-rw-r--r-- utf/src/utf8.cc 68
1 files changed, 68 insertions, 0 deletions
diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc
new file mode 100644
index 0000000..54b0296
--- /dev/null
+++ b/utf/src/utf8.cc
@@ -0,0 +1,68 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
// Returns true when `c` is a Unicode scalar value: no greater than U+10FFFF
// and not inside the UTF-16 surrogate range U+D800..U+DFFF.
//
// Declared constexpr so the check is also usable in constant expressions.
// The enclosing unnamed namespace already gives the function internal
// linkage, so an explicit `inline` is not needed.
constexpr bool valid_codepoint(uint32_t c) noexcept {
  return c < 0xd800 || (c > 0xdfff && c <= 0x10ffff);
}
+
+} // namespace
+
+uint32_t read8(std::string_view data, std::size_t& offset) {
+ if (offset >= data.size())
+ return NEED_MORE;
+ uint32_t ret;
+ uint8_t size;
+ switch (static_cast<uint8_t>(data[offset]) >> 4) {
+ case 15:
+ if (data[offset] & 0x08)
+ return INVALID;
+ ret = static_cast<uint32_t>(data[offset] & 0x07) << 18;
+ size = 4;
+ break;
+ case 14:
+ ret = static_cast<uint32_t>(data[offset] & 0x0f) << 12;
+ size = 3;
+ break;
+ case 13:
+ case 12:
+ ret = static_cast<uint32_t>(data[offset] & 0x1f) << 6;
+ size = 2;
+ break;
+ default:
+ if (data[offset] & 0x80)
+ return INVALID;
+ return data[offset++];
+ }
+ if (data.size() - offset < size)
+ return NEED_MORE;
+ for (uint8_t i = 1; i < size; ++i) {
+ if ((data[offset + i] & 0xc0) != 0x80)
+ return INVALID;
+ ret |= static_cast<uint32_t>(data[offset + i] & 0x3f) << (size - i - 1) * 6;
+ }
+ if (!valid_codepoint(ret))
+ return INVALID;
+ switch (size) {
+ case 4:
+ if (ret < 0x10000)
+ return INVALID;
+ break;
+ case 3:
+ if (ret < 0x800)
+ return INVALID;
+ break;
+ case 2:
+ if (ret < 0x80)
+ return INVALID;
+ break;
+ }
+ offset += size;
+ return ret;
+}
+
+} // namespace utf