summaryrefslogtreecommitdiff
path: root/utf/src
diff options
context:
space:
mode:
Diffstat (limited to 'utf/src')
-rw-r--r--utf/src/utf16.cc67
-rw-r--r--utf/src/utf32.cc43
-rw-r--r--utf/src/utf8.cc68
3 files changed, 178 insertions, 0 deletions
diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc
new file mode 100644
index 0000000..43595bf
--- /dev/null
+++ b/utf/src/utf16.cc
@@ -0,0 +1,67 @@
+#include "utf16.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+// Reports whether the UTF-16 code unit `c` is a high (leading) surrogate,
+// i.e. lies in the range 0xD800..0xDBFF.
+inline bool is_high_surrogate(uint16_t c) {
+  // Every value in 0xD800..0xDBFF shares the top six bits 110110.
+  return (c & 0xfc00) == 0xd800;
+}
+
+// Reports whether the UTF-16 code unit `c` is a low (trailing) surrogate,
+// i.e. lies in the range 0xDC00..0xDFFF.
+inline bool is_low_surrogate(uint16_t c) {
+  // Every value in 0xDC00..0xDFFF shares the top six bits 110111.
+  return (c & 0xfc00) == 0xdc00;
+}
+
+} // namespace
+
+// Decodes one big-endian UTF-16 code point from `data` starting at `offset`.
+//
+// On success the decoded code point is returned and `offset` is advanced past
+// the 2 or 4 bytes that were consumed. NEED_MORE is returned when fewer bytes
+// remain than the code point requires, and INVALID for a lone low surrogate
+// or a high surrogate that is not followed by a low one; `offset` is left
+// untouched in both of those cases. NEED_MORE and INVALID are not defined in
+// this file (presumably they come from utf_error.hh).
+uint32_t read16be(std::string_view data, std::size_t& offset) {
+  if (offset > data.size())
+    return NEED_MORE;
+  const std::size_t remaining = data.size() - offset;
+  if (remaining < 2)
+    return NEED_MORE;
+
+  // Assembles the 16-bit unit whose first byte sits at `offset + at`. Going
+  // through uint8_t keeps a signed `char` from sign-extending into the result.
+  const auto unit = [&](std::size_t at) -> uint16_t {
+    const auto msb = static_cast<uint8_t>(data[offset + at]);
+    const auto lsb = static_cast<uint8_t>(data[offset + at + 1]);
+    return static_cast<uint16_t>(msb << 8 | lsb);
+  };
+
+  const uint16_t lead = unit(0);
+  if (is_low_surrogate(lead))
+    return INVALID;
+  if (!is_high_surrogate(lead)) {
+    offset += 2;
+    return lead;
+  }
+
+  // A high surrogate needs its partner, so four bytes in total.
+  if (remaining < 4)
+    return NEED_MORE;
+  const uint16_t trail = unit(2);
+  if (!is_low_surrogate(trail))
+    return INVALID;
+
+  offset += 4;
+  const uint32_t upper = static_cast<uint32_t>(lead & 0x3ff) << 10;
+  const uint32_t lower = trail & 0x3ff;
+  return 0x10000 + (upper | lower);
+}
+
+// Decodes one little-endian UTF-16 code point from `data` starting at
+// `offset`.
+//
+// On success the decoded code point is returned and `offset` is advanced past
+// the 2 or 4 bytes that were consumed. NEED_MORE is returned when fewer bytes
+// remain than the code point requires, and INVALID for a lone low surrogate
+// or a high surrogate that is not followed by a low one; `offset` is left
+// untouched in both of those cases. NEED_MORE and INVALID are not defined in
+// this file (presumably they come from utf_error.hh).
+uint32_t read16le(std::string_view data, std::size_t& offset) {
+  if (offset > data.size())
+    return NEED_MORE;
+  const std::size_t remaining = data.size() - offset;
+  if (remaining < 2)
+    return NEED_MORE;
+
+  // Assembles the 16-bit unit whose first (least significant) byte sits at
+  // `offset + at`. Going through uint8_t keeps a signed `char` from
+  // sign-extending into the result.
+  const auto unit = [&](std::size_t at) -> uint16_t {
+    const auto lsb = static_cast<uint8_t>(data[offset + at]);
+    const auto msb = static_cast<uint8_t>(data[offset + at + 1]);
+    return static_cast<uint16_t>(msb << 8 | lsb);
+  };
+
+  const uint16_t lead = unit(0);
+  if (is_low_surrogate(lead))
+    return INVALID;
+  if (!is_high_surrogate(lead)) {
+    offset += 2;
+    return lead;
+  }
+
+  // A high surrogate needs its partner, so four bytes in total.
+  if (remaining < 4)
+    return NEED_MORE;
+  const uint16_t trail = unit(2);
+  if (!is_low_surrogate(trail))
+    return INVALID;
+
+  offset += 4;
+  const uint32_t upper = static_cast<uint32_t>(lead & 0x3ff) << 10;
+  const uint32_t lower = trail & 0x3ff;
+  return 0x10000 + (upper | lower);
+}
+
+} // namespace utf
diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc
new file mode 100644
index 0000000..cfa29b6
--- /dev/null
+++ b/utf/src/utf32.cc
@@ -0,0 +1,43 @@
+#include "utf32.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+// Reports whether `c` is a Unicode scalar value: at most 0x10FFFF and outside
+// the surrogate range 0xD800..0xDFFF.
+inline bool valid_codepoint(uint32_t c) {
+  if (c > 0x10ffff)
+    return false;
+  const bool surrogate = c >= 0xd800 && c <= 0xdfff;
+  return !surrogate;
+}
+
+} // namespace
+
+// Decodes one big-endian UTF-32 code point from `data` starting at `offset`.
+//
+// Returns NEED_MORE when fewer than four bytes remain and INVALID when the
+// four bytes do not pass valid_codepoint(); `offset` is left untouched in
+// both cases. Otherwise `offset` is advanced by four and the value returned.
+// NEED_MORE and INVALID are not defined in this file (presumably they come
+// from utf_error.hh).
+uint32_t read32be(std::string_view data, std::size_t& offset) {
+  const bool enough = offset <= data.size() && data.size() - offset >= 4;
+  if (!enough)
+    return NEED_MORE;
+
+  // Most significant byte first; the uint8_t cast stops a signed `char` from
+  // sign-extending into the higher bits.
+  uint32_t value = 0;
+  for (std::size_t i = 0; i < 4; ++i)
+    value = value << 8 | static_cast<uint8_t>(data[offset + i]);
+
+  if (!valid_codepoint(value))
+    return INVALID;
+  offset += 4;
+  return value;
+}
+
+// Decodes one little-endian UTF-32 code point from `data` starting at
+// `offset`.
+//
+// Returns NEED_MORE when fewer than four bytes remain and INVALID when the
+// four bytes do not pass valid_codepoint(); `offset` is left untouched in
+// both cases. Otherwise `offset` is advanced by four and the value returned.
+// NEED_MORE and INVALID are not defined in this file (presumably they come
+// from utf_error.hh).
+uint32_t read32le(std::string_view data, std::size_t& offset) {
+  const bool enough = offset <= data.size() && data.size() - offset >= 4;
+  if (!enough)
+    return NEED_MORE;
+
+  // The last byte is the most significant, so fold from the end backwards.
+  // The uint8_t cast stops a signed `char` from sign-extending.
+  uint32_t value = 0;
+  for (std::size_t i = 4; i > 0; --i)
+    value = value << 8 | static_cast<uint8_t>(data[offset + i - 1]);
+
+  if (!valid_codepoint(value))
+    return INVALID;
+  offset += 4;
+  return value;
+}
+
+} // namespace utf
diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc
new file mode 100644
index 0000000..54b0296
--- /dev/null
+++ b/utf/src/utf8.cc
@@ -0,0 +1,68 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+// Reports whether `c` may be produced by the decoder: it must not exceed
+// 0x10FFFF and must not fall inside the surrogate range 0xD800..0xDFFF.
+inline bool valid_codepoint(uint32_t c) {
+  const bool in_range = c <= 0x10ffff;
+  const bool surrogate = c >= 0xd800 && c <= 0xdfff;
+  return in_range && !surrogate;
+}
+
+} // namespace
+
+// Decodes one UTF-8 encoded code point from `data` starting at `offset`.
+//
+// On success the code point is returned and `offset` is advanced past the
+// 1 to 4 bytes consumed. On NEED_MORE or INVALID `offset` is left untouched.
+//
+// Returns NEED_MORE when `offset` is at or past the end of `data`, or when
+// the sequence announced by the lead byte is truncated and every continuation
+// byte seen so far is well formed.
+//
+// Returns INVALID for a stray continuation byte, a lead byte of 0xF8 or
+// above, a malformed continuation byte, an overlong encoding, a surrogate, or
+// a value above 0x10FFFF. A malformed continuation byte is reported as
+// INVALID even when the sequence is also truncated: no amount of further
+// input can repair it, so asking the caller for more would only stall.
+//
+// NEED_MORE and INVALID are not defined in this file (presumably they come
+// from utf_error.hh).
+uint32_t read8(std::string_view data, std::size_t& offset) {
+  if (offset >= data.size())
+    return NEED_MORE;
+
+  const auto lead = static_cast<uint8_t>(data[offset]);
+  uint32_t ret;
+  uint8_t size;
+  switch (lead >> 4) {
+  case 15:
+    // 11110xxx starts a four-byte sequence; 11111xxx is never legal.
+    if (lead & 0x08)
+      return INVALID;
+    ret = static_cast<uint32_t>(lead & 0x07) << 18;
+    size = 4;
+    break;
+  case 14:
+    ret = static_cast<uint32_t>(lead & 0x0f) << 12;
+    size = 3;
+    break;
+  case 13:
+  case 12:
+    ret = static_cast<uint32_t>(lead & 0x1f) << 6;
+    size = 2;
+    break;
+  default:
+    // 10xxxxxx is a continuation byte with no lead; anything else is ASCII.
+    if (lead & 0x80)
+      return INVALID;
+    ++offset;
+    return lead;
+  }
+
+  // Check the continuation bytes that are actually present before deciding
+  // whether more input is needed, so a broken sequence is rejected at once.
+  const std::size_t available = data.size() - offset;
+  const std::size_t present = available < size ? available : size;
+  for (std::size_t i = 1; i < present; ++i) {
+    const auto byte = static_cast<uint8_t>(data[offset + i]);
+    if ((byte & 0xc0) != 0x80)
+      return INVALID;
+    ret |= static_cast<uint32_t>(byte & 0x3f) << (size - i - 1) * 6;
+  }
+  if (available < size)
+    return NEED_MORE;
+
+  if (!valid_codepoint(ret))
+    return INVALID;
+
+  // Reject overlong forms: each length has a smallest value it may encode.
+  static constexpr uint32_t kSmallest[] = {0, 0, 0x80, 0x800, 0x10000};
+  if (ret < kSmallest[size])
+    return INVALID;
+
+  offset += size;
+  return ret;
+}
+
+} // namespace utf