summaryrefslogtreecommitdiff
path: root/utf
diff options
context:
space:
mode:
Diffstat (limited to 'utf')
-rw-r--r-- utf/inc/utf16.hh 31
-rw-r--r-- utf/inc/utf32.hh 29
-rw-r--r-- utf/inc/utf8.hh 22
-rw-r--r-- utf/inc/utf_error.hh 13
-rw-r--r-- utf/meson.build 38
-rw-r--r-- utf/src/utf16.cc 67
-rw-r--r-- utf/src/utf32.cc 43
-rw-r--r-- utf/src/utf8.cc 68
-rw-r--r-- utf/tst/test_utf16.cc 157
-rw-r--r-- utf/tst/test_utf32.cc 145
-rw-r--r-- utf/tst/test_utf8.cc 188
11 files changed, 801 insertions, 0 deletions
diff --git a/utf/inc/utf16.hh b/utf/inc/utf16.hh
new file mode 100644
index 0000000..344b1a2
--- /dev/null
+++ b/utf/inc/utf16.hh
@@ -0,0 +1,31 @@
+#ifndef UTF_UTF16_HH
+#define UTF_UTF16_HH
+
+#include "macros.hh"
+
+#include <cstdint>
+#include <string_view>
+
+namespace utf {
+
+/* Read one Unicode code point from UTF-16 big-endian encoded data, starting
+ * at byte position offset.
+ * On success the code point is returned and offset is advanced past the
+ * bytes consumed (2, or 4 for a surrogate pair).
+ * On failure offset is left unchanged and one of these is returned:
+ * - NEED_MORE: fewer than 2 bytes remain at offset, or the first code unit
+ *   is a high surrogate and fewer than 4 bytes remain.
+ * - INVALID: the data is not valid UTF-16, i.e. a low surrogate that does
+ *   not follow a high surrogate, or a high surrogate that is not followed
+ *   by a low surrogate.
+ */
+uint32_t HIDDEN read16be(std::string_view data, std::size_t& offset);
+
+/* Read one Unicode code point from UTF-16 little-endian encoded data,
+ * starting at byte position offset.
+ * On success the code point is returned and offset is advanced past the
+ * bytes consumed (2, or 4 for a surrogate pair).
+ * On failure offset is left unchanged and one of these is returned:
+ * - NEED_MORE: fewer than 2 bytes remain at offset, or the first code unit
+ *   is a high surrogate and fewer than 4 bytes remain.
+ * - INVALID: the data is not valid UTF-16, i.e. a low surrogate that does
+ *   not follow a high surrogate, or a high surrogate that is not followed
+ *   by a low surrogate.
+ */
+uint32_t HIDDEN read16le(std::string_view data, std::size_t& offset);
+
+} // namespace utf
+
+#endif // UTF_UTF16_HH
diff --git a/utf/inc/utf32.hh b/utf/inc/utf32.hh
new file mode 100644
index 0000000..2d3088e
--- /dev/null
+++ b/utf/inc/utf32.hh
@@ -0,0 +1,29 @@
+#ifndef UTF_UTF32_HH
+#define UTF_UTF32_HH
+
+#include "macros.hh"
+
+#include <cstdint>
+#include <string_view>
+
+namespace utf {
+
+/* Read one Unicode code point from UTF-32 big-endian encoded data, starting
+ * at byte position offset.
+ * On success the code point is returned and offset is advanced by 4.
+ * On failure offset is left unchanged and one of these is returned:
+ * - NEED_MORE: fewer than 4 bytes remain at offset.
+ * - INVALID: the value is not a valid code point, i.e. it lies in the
+ *   surrogate range 0xD800-0xDFFF or is greater than 0x10FFFF.
+ */
+uint32_t HIDDEN read32be(std::string_view data, std::size_t& offset);
+
+/* Read one Unicode code point from UTF-32 little-endian encoded data,
+ * starting at byte position offset.
+ * On success the code point is returned and offset is advanced by 4.
+ * On failure offset is left unchanged and one of these is returned:
+ * - NEED_MORE: fewer than 4 bytes remain at offset.
+ * - INVALID: the value is not a valid code point, i.e. it lies in the
+ *   surrogate range 0xD800-0xDFFF or is greater than 0x10FFFF.
+ */
+uint32_t HIDDEN read32le(std::string_view data, std::size_t& offset);
+
+} // namespace utf
+
+#endif // UTF_UTF32_HH
diff --git a/utf/inc/utf8.hh b/utf/inc/utf8.hh
new file mode 100644
index 0000000..a3ea84a
--- /dev/null
+++ b/utf/inc/utf8.hh
@@ -0,0 +1,22 @@
+#ifndef UTF_UTF8_HH
+#define UTF_UTF8_HH
+
+#include "macros.hh"
+
+#include <cstdint>
+#include <string_view>
+
+namespace utf {
+
+/* Read one Unicode code point from UTF-8 encoded data, starting at byte
+ * position offset.
+ * On success the code point is returned and offset is advanced past the
+ * 1 to 4 bytes consumed.
+ * On failure offset is left unchanged and one of these is returned:
+ * - NEED_MORE: offset is at or past the end of data, or the sequence
+ *   announced by the lead byte is longer than the bytes remaining (this
+ *   is checked before the continuation bytes are inspected).
+ * - INVALID: the data is not valid UTF-8; this includes bad lead or
+ *   continuation bytes, overlong encodings and invalid Unicode code
+ *   points (surrogates and values above 0x10FFFF).
+ */
+uint32_t HIDDEN read8(std::string_view data, std::size_t& offset);
+
+} // namespace utf
+
+#endif // UTF_UTF8_HH
diff --git a/utf/inc/utf_error.hh b/utf/inc/utf_error.hh
new file mode 100644
index 0000000..079fa43
--- /dev/null
+++ b/utf/inc/utf_error.hh
@@ -0,0 +1,13 @@
+#ifndef UTF_ERROR_HH
+#define UTF_ERROR_HH
+
+#include <cstdint>
+
+namespace utf {
+
+/* Sentinel values returned by the utf::read* decoders instead of a code
+ * point. Both are above 0x10ffff, the largest value the decoders accept as
+ * a code point, so they cannot be mistaken for decoded data.
+ *
+ * Declared inline so that every translation unit including this header
+ * refers to the same two objects.
+ */
+
+// The input ended before a complete code point could be read.
+inline constexpr uint32_t NEED_MORE = 0xfffffffe;
+// The input at the given offset is not a valid encoding.
+inline constexpr uint32_t INVALID = 0xffffffff;
+
+} // namespace utf
+
+#endif // UTF_ERROR_HH
diff --git a/utf/meson.build b/utf/meson.build
new file mode 100644
index 0000000..64db6ff
--- /dev/null
+++ b/utf/meson.build
@@ -0,0 +1,38 @@
+# Build description for the utf decoding library and its unit tests.
+
+# Dependencies shared by the library and by everything that links to it.
+# NOTE(review): base_dep is not defined in this file; presumably it comes
+# from a parent meson.build.
+deps = [
+ base_dep,
+]
+
+# Headers live in inc/; the library itself is static and is not installed.
+inc = include_directories('inc')
+lib = static_library(
+ 'utf',
+ 'src/utf8.cc',
+ 'src/utf16.cc',
+ 'src/utf32.cc',
+ dependencies: deps,
+ include_directories: inc,
+ install: false,
+)
+
+# Dependency object for users of the library: include path, link target and
+# the shared dependencies.
+utf_dep = declare_dependency(
+ dependencies: deps,
+ include_directories: inc,
+ link_with: lib,
+)
+
+# One test executable per encoding.
+# NOTE(review): gtest_dep is not defined in this file; presumably it comes
+# from a parent meson.build.
+test('utf8',
+ executable(
+ 'test_utf8',
+ sources: ['tst/test_utf8.cc'],
+ dependencies: [utf_dep, gtest_dep]))
+
+test('utf16',
+ executable(
+ 'test_utf16',
+ sources: ['tst/test_utf16.cc'],
+ dependencies: [utf_dep, gtest_dep]))
+
+test('utf32',
+ executable(
+ 'test_utf32',
+ sources: ['tst/test_utf32.cc'],
+ dependencies: [utf_dep, gtest_dep]))
diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc
new file mode 100644
index 0000000..43595bf
--- /dev/null
+++ b/utf/src/utf16.cc
@@ -0,0 +1,67 @@
+#include "utf16.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+// Byte order of the 16-bit code units in the input.
+enum class byte_order { big, little };
+
+// True for a leading (high) surrogate code unit, 0xD800-0xDBFF.
+inline bool is_high_surrogate(uint16_t c) {
+  return c >= 0xd800 && c <= 0xdbff;
+}
+
+// True for a trailing (low) surrogate code unit, 0xDC00-0xDFFF.
+inline bool is_low_surrogate(uint16_t c) {
+  return c >= 0xdc00 && c <= 0xdfff;
+}
+
+// Assemble the 16-bit code unit stored in data[pos] and data[pos + 1].
+// The caller guarantees that both bytes exist.
+template <byte_order Order>
+uint16_t unit_at(std::string_view data, std::size_t pos) {
+  // Convert through unsigned char so bytes >= 0x80 are never sign-extended
+  // on platforms where char is signed.
+  const unsigned first = static_cast<unsigned char>(data[pos]);
+  const unsigned second = static_cast<unsigned char>(data[pos + 1]);
+  return static_cast<uint16_t>(Order == byte_order::big
+                                   ? (first << 8) | second
+                                   : (second << 8) | first);
+}
+
+// Shared decoder behind read16be() and read16le().
+// Returns the code point and advances offset by 2 or 4 on success. Returns
+// NEED_MORE or INVALID and leaves offset untouched on failure.
+template <byte_order Order>
+uint32_t read16(std::string_view data, std::size_t& offset) {
+  // The first test also keeps the subtraction from wrapping around.
+  if (offset > data.size() || data.size() - offset < 2)
+    return NEED_MORE;
+  const uint16_t lead = unit_at<Order>(data, offset);
+  // A low surrogate may only appear as the second half of a pair.
+  if (is_low_surrogate(lead))
+    return INVALID;
+  if (!is_high_surrogate(lead)) {
+    offset += 2;
+    return lead;
+  }
+  // A high surrogate needs a second code unit to complete the pair.
+  if (data.size() - offset < 4)
+    return NEED_MORE;
+  const uint16_t trail = unit_at<Order>(data, offset + 2);
+  if (!is_low_surrogate(trail))
+    return INVALID;
+  offset += 4;
+  // Each surrogate carries 10 payload bits; pairs start at 0x10000.
+  return 0x10000
+      + ((static_cast<uint32_t>(lead & 0x3ff) << 10) | (trail & 0x3ff));
+}
+
+} // namespace
+
+// Decode one code point from UTF-16 big-endian data at offset.
+uint32_t read16be(std::string_view data, std::size_t& offset) {
+  return read16<byte_order::big>(data, offset);
+}
+
+// Decode one code point from UTF-16 little-endian data at offset.
+uint32_t read16le(std::string_view data, std::size_t& offset) {
+  return read16<byte_order::little>(data, offset);
+}
+
+} // namespace utf
diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc
new file mode 100644
index 0000000..cfa29b6
--- /dev/null
+++ b/utf/src/utf32.cc
@@ -0,0 +1,43 @@
+#include "utf32.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+// Byte order of the 32-bit code units in the input.
+enum class byte_order { big, little };
+
+// A code point is valid when it is at most 0x10FFFF and is not a surrogate
+// (0xD800-0xDFFF).
+inline bool valid_codepoint(uint32_t c) {
+  return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff);
+}
+
+// Shared decoder behind read32be() and read32le().
+// Returns the code point and advances offset by 4 on success. Returns
+// NEED_MORE or INVALID and leaves offset untouched on failure.
+template <byte_order Order>
+uint32_t read32(std::string_view data, std::size_t& offset) {
+  // The first test also keeps the subtraction from wrapping around.
+  if (offset > data.size() || data.size() - offset < 4)
+    return NEED_MORE;
+  uint32_t c = 0;
+  // Fold the four bytes in, most significant first.
+  for (std::size_t i = 0; i < 4; ++i) {
+    const std::size_t idx = Order == byte_order::big ? i : 3 - i;
+    // Convert through unsigned char so bytes >= 0x80 are never
+    // sign-extended on platforms where char is signed.
+    c = (c << 8) | static_cast<unsigned char>(data[offset + idx]);
+  }
+  if (!valid_codepoint(c))
+    return INVALID;
+  offset += 4;
+  return c;
+}
+
+} // namespace
+
+// Decode one code point from UTF-32 big-endian data at offset.
+uint32_t read32be(std::string_view data, std::size_t& offset) {
+  return read32<byte_order::big>(data, offset);
+}
+
+// Decode one code point from UTF-32 little-endian data at offset.
+uint32_t read32le(std::string_view data, std::size_t& offset) {
+  return read32<byte_order::little>(data, offset);
+}
+
+} // namespace utf
diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc
new file mode 100644
index 0000000..54b0296
--- /dev/null
+++ b/utf/src/utf8.cc
@@ -0,0 +1,68 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+// A code point is valid when it is at most 0x10FFFF and is not a surrogate
+// (0xD800-0xDFFF).
+inline bool valid_codepoint(uint32_t c) {
+  if (c > 0x10ffff)
+    return false;
+  return c < 0xd800 || c > 0xdfff;
+}
+
+} // namespace
+
+// Decode one code point from UTF-8 data at offset.
+// Returns the code point and advances offset by the sequence length on
+// success. Returns NEED_MORE or INVALID and leaves offset untouched on
+// failure.
+uint32_t read8(std::string_view data, std::size_t& offset) {
+  if (offset >= data.size())
+    return NEED_MORE;
+
+  const uint8_t lead = static_cast<uint8_t>(data[offset]);
+  if (lead < 0x80) {
+    // Single-byte (ASCII) sequence.
+    ++offset;
+    return lead;
+  }
+
+  // Classify the lead byte: total sequence length, the smallest code point
+  // that legitimately needs that length, and the payload bits of the lead.
+  uint8_t length;
+  uint32_t smallest;
+  uint32_t cp;
+  if ((lead & 0xe0) == 0xc0) {
+    length = 2;
+    smallest = 0x80;
+    cp = lead & 0x1f;
+  } else if ((lead & 0xf0) == 0xe0) {
+    length = 3;
+    smallest = 0x800;
+    cp = lead & 0x0f;
+  } else if ((lead & 0xf8) == 0xf0) {
+    length = 4;
+    smallest = 0x10000;
+    cp = lead & 0x07;
+  } else {
+    // A stray continuation byte (0x80-0xBF) or 0xF8-0xFF.
+    return INVALID;
+  }
+
+  // The length check comes before the continuation bytes are inspected.
+  if (data.size() - offset < length)
+    return NEED_MORE;
+
+  for (uint8_t i = 1; i < length; ++i) {
+    const uint8_t tail = static_cast<uint8_t>(data[offset + i]);
+    if ((tail & 0xc0) != 0x80)
+      return INVALID;
+    // Each continuation byte contributes six payload bits.
+    cp = (cp << 6) | (tail & 0x3f);
+  }
+
+  // Reject surrogates, values above 0x10FFFF and overlong encodings.
+  if (!valid_codepoint(cp) || cp < smallest)
+    return INVALID;
+
+  offset += length;
+  return cp;
+}
+
+} // namespace utf
diff --git a/utf/tst/test_utf16.cc b/utf/tst/test_utf16.cc
new file mode 100644
index 0000000..c17982e
--- /dev/null
+++ b/utf/tst/test_utf16.cc
@@ -0,0 +1,157 @@
+#include "utf16.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+// Valid big-endian input: BMP characters and surrogate pairs.
+TEST(utf16be, sanity) {
+  // Decode input from offset 0 and expect code point want with used bytes
+  // consumed.
+  const auto check = [](std::string_view input, uint32_t want, size_t used) {
+    size_t pos = 0;
+    const uint32_t got = utf::read16be(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(used, pos);
+  };
+
+  // Explicit length because the data contains a NUL byte.
+  check(std::string_view("\x00\x24", 2), '$', 2);
+  check("\x20\xAC", 0x20AC, 2);
+  check("\xD8\x01\xDC\x37", 0x10437, 4);
+  check("\xD8\x52\xDF\x62", 0x24B62, 4);
+}
+
+// Valid little-endian input: BMP characters and surrogate pairs.
+TEST(utf16le, sanity) {
+  // Decode input from offset 0 and expect code point want with used bytes
+  // consumed.
+  const auto check = [](std::string_view input, uint32_t want, size_t used) {
+    size_t pos = 0;
+    const uint32_t got = utf::read16le(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(used, pos);
+  };
+
+  // Explicit length because the data contains a NUL byte.
+  check(std::string_view("\x24\x00", 2), '$', 2);
+  check("\xAC\x20", 0x20AC, 2);
+  check("\x01\xD8\x37\xDC", 0x10437, 4);
+  check("\x52\xD8\x62\xDF", 0x24B62, 4);
+}
+
+// A leading byte order mark is returned like any other code point, and
+// reading past the end reports NEED_MORE without moving the offset.
+TEST(utf16be, bom) {
+  const std::string_view input("\xFE\xFF\x20\xAC");
+  const uint32_t expected[] = {0xFEFF, 0x20AC, utf::NEED_MORE};
+  size_t pos = 0;
+  for (const uint32_t want : expected)
+    EXPECT_EQ(want, utf::read16be(input, pos));
+  EXPECT_EQ(input.size(), pos);
+}
+
+// A leading byte order mark is returned like any other code point, and
+// reading past the end reports NEED_MORE without moving the offset.
+TEST(utf16le, bom) {
+  const std::string_view input("\xFF\xFE\xAC\x20");
+  const uint32_t expected[] = {0xFEFF, 0x20AC, utf::NEED_MORE};
+  size_t pos = 0;
+  for (const uint32_t want : expected)
+    EXPECT_EQ(want, utf::read16le(input, pos));
+  EXPECT_EQ(input.size(), pos);
+}
+
+// Truncated and malformed big-endian input; the offset must never move.
+TEST(utf16be, invalid) {
+  // Decode input from offset 0 and expect the error code want.
+  const auto check = [](std::string_view input, uint32_t want) {
+    size_t pos = 0;
+    const uint32_t got = utf::read16be(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(0, pos);
+  };
+
+  // Not enough data for a code unit or for the second half of a pair.
+  check("\xD8", utf::NEED_MORE);
+  check("", utf::NEED_MORE);
+  check("\xD8\x01", utf::NEED_MORE);
+  check("\xD8\x01\xDC", utf::NEED_MORE);
+
+  // Low surrogate first, and high surrogate followed by another high one.
+  check("\xDC\x37\xD8\x01", utf::INVALID);
+  check("\xD8\x01\xD8\x01", utf::INVALID);
+}
+
+// Truncated and malformed little-endian input; the offset must never move.
+TEST(utf16le, invalid) {
+  // Decode input from offset 0 and expect the error code want.
+  const auto check = [](std::string_view input, uint32_t want) {
+    size_t pos = 0;
+    const uint32_t got = utf::read16le(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(0, pos);
+  };
+
+  // Not enough data for a code unit or for the second half of a pair.
+  check("\x01", utf::NEED_MORE);
+  check("", utf::NEED_MORE);
+  check("\x01\xD8", utf::NEED_MORE);
+  check("\x01\xD8\x37", utf::NEED_MORE);
+
+  // Low surrogate first, and high surrogate followed by another high one.
+  check("\x37\xDC\x01\xD8", utf::INVALID);
+  check("\x01\xD8\x01\xD8", utf::INVALID);
+}
diff --git a/utf/tst/test_utf32.cc b/utf/tst/test_utf32.cc
new file mode 100644
index 0000000..796b4cd
--- /dev/null
+++ b/utf/tst/test_utf32.cc
@@ -0,0 +1,145 @@
+#include "utf32.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+// Valid big-endian input, inside and outside the BMP.
+TEST(utf32be, sanity) {
+  // Decode the 4 bytes at raw and expect code point want, offset moved to 4.
+  const auto check = [](const char* raw, uint32_t want) {
+    // Explicit length because the data contains NUL bytes.
+    const std::string_view input(raw, 4);
+    size_t pos = 0;
+    const uint32_t got = utf::read32be(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(4, pos);
+  };
+
+  check("\x00\x00\x00\x24", '$');
+  check("\x00\x00\x20\xAC", 0x20AC);
+  check("\x00\x01\x04\x37", 0x10437);
+}
+
+// Valid little-endian input, inside and outside the BMP.
+TEST(utf32le, sanity) {
+  // Decode the 4 bytes at raw and expect code point want, offset moved to 4.
+  const auto check = [](const char* raw, uint32_t want) {
+    // Explicit length because the data contains NUL bytes.
+    const std::string_view input(raw, 4);
+    size_t pos = 0;
+    const uint32_t got = utf::read32le(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(4, pos);
+  };
+
+  check("\x24\x00\x00\x00", '$');
+  check("\xAC\x20\x00\x00", 0x20AC);
+  check("\x37\x04\x01\x00", 0x10437);
+}
+
+// Out-of-range and truncated big-endian input; the offset must never move.
+TEST(utf32be, invalid) {
+  // Decode input from offset 0 and expect the error code want.
+  const auto check = [](std::string_view input, uint32_t want) {
+    size_t pos = 0;
+    const uint32_t got = utf::read32be(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(0, pos);
+  };
+
+  // Above 0x10FFFF, and a surrogate value.
+  check("\xFF\xFF\xFF\xFF", utf::INVALID);
+  check(std::string_view("\x00\x00\xD8\x00", 4), utf::INVALID);
+
+  // Zero to three bytes are never enough.
+  check("", utf::NEED_MORE);
+  check(std::string_view("\x00", 1), utf::NEED_MORE);
+  check(std::string_view("\x00\x00", 2), utf::NEED_MORE);
+  check(std::string_view("\x00\x00\x00", 3), utf::NEED_MORE);
+}
+
+// Out-of-range and truncated little-endian input; the offset must never
+// move.
+TEST(utf32le, invalid) {
+  // Decode input from offset 0 and expect the error code want.
+  const auto check = [](std::string_view input, uint32_t want) {
+    size_t pos = 0;
+    const uint32_t got = utf::read32le(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(0, pos);
+  };
+
+  // Above 0x10FFFF, and a surrogate value.
+  check("\xFF\xFF\xFF\xFF", utf::INVALID);
+  check(std::string_view("\x00\xD8\x00\x00", 4), utf::INVALID);
+
+  // Zero to three bytes are never enough.
+  check("", utf::NEED_MORE);
+  check(std::string_view("\x00", 1), utf::NEED_MORE);
+  check(std::string_view("\x00\x00", 2), utf::NEED_MORE);
+  check(std::string_view("\x00\x00\x00", 3), utf::NEED_MORE);
+}
+
+// A leading byte order mark (U+FEFF, bytes 00 00 FE FF in big-endian) is
+// returned like any other code point, and reading past the end reports
+// NEED_MORE without moving the offset.
+TEST(utf32be, bom) {
+  // Explicit length because the data contains NUL bytes.
+  const std::string_view input("\x00\x00\xFE\xFF\x00\x00\x20\xAC", 8);
+  const uint32_t expected[] = {0xFEFF, 0x20AC, utf::NEED_MORE};
+  size_t pos = 0;
+  for (const uint32_t want : expected)
+    EXPECT_EQ(want, utf::read32be(input, pos));
+  EXPECT_EQ(input.size(), pos);
+}
+
+// A leading byte order mark (U+FEFF, bytes FF FE 00 00 in little-endian)
+// is returned like any other code point, and reading past the end reports
+// NEED_MORE without moving the offset.
+TEST(utf32le, bom) {
+  // Explicit length because the data contains NUL bytes.
+  const std::string_view input("\xFF\xFE\x00\x00\xAC\x20\x00\x00", 8);
+  const uint32_t expected[] = {0xFEFF, 0x20AC, utf::NEED_MORE};
+  size_t pos = 0;
+  for (const uint32_t want : expected)
+    EXPECT_EQ(want, utf::read32le(input, pos));
+  EXPECT_EQ(input.size(), pos);
+}
diff --git a/utf/tst/test_utf8.cc b/utf/tst/test_utf8.cc
new file mode 100644
index 0000000..10df969
--- /dev/null
+++ b/utf/tst/test_utf8.cc
@@ -0,0 +1,188 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+// Valid sequences of every length from one to four bytes.
+TEST(utf8, sanity) {
+  // Decode input from offset 0 and expect code point want with used bytes
+  // consumed.
+  const auto check = [](std::string_view input, uint32_t want, size_t used) {
+    size_t pos = 0;
+    const uint32_t got = utf::read8(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(used, pos);
+  };
+
+  check("$", '$', 1);
+  check("\xC2\xA3", 0xa3, 2);
+  check("\xD0\x98", 0x418, 2);
+  check("\xE0\xA4\xB9", 0x939, 3);
+  check("\xE2\x82\xAC", 0x20AC, 3);
+  check("\xED\x95\x9C", 0xD55C, 3);
+  check("\xF0\x90\x8D\x88", 0x10348, 4);
+}
+
+// Code points encoded with more bytes than necessary must be rejected
+// without moving the offset.
+TEST(utf8, overlong) {
+  const char* const inputs[] = {
+    "\xF0\x82\x82\xAC",
+    "\xE0\x81\x81",
+    "\xC0\x80",
+  };
+  for (const std::string_view input : inputs) {
+    size_t pos = 0;
+    const uint32_t got = utf::read8(input, pos);
+    EXPECT_EQ(utf::INVALID, got);
+    EXPECT_EQ(0, pos);
+  }
+}
+
+// Malformed and truncated input; the offset must never move.
+TEST(utf8, invalid) {
+  // Decode input from offset 0 and expect the error code want.
+  const auto check = [](std::string_view input, uint32_t want) {
+    size_t pos = 0;
+    const uint32_t got = utf::read8(input, pos);
+    EXPECT_EQ(want, got);
+    EXPECT_EQ(0, pos);
+  };
+
+  check("\xED\xB0\x80", utf::INVALID);          // encodes a surrogate
+  check("\xFB\xFF\xFF", utf::INVALID);          // bad lead byte
+  check("\xFF\xFF\xFF\xFF\xFF", utf::INVALID);  // bad lead byte
+  check("", utf::NEED_MORE);                    // nothing to read
+  check("\x80", utf::INVALID);                  // stray continuation byte
+  check("\xC2", utf::NEED_MORE);                // 2-byte sequence cut short
+  check("\xC2\x03", utf::INVALID);              // bad continuation byte
+  check("\xE0\xA4", utf::NEED_MORE);            // 3-byte sequence cut short
+  check("\xF0\x90\x8D", utf::NEED_MORE);        // 4-byte sequence cut short
+}
+
+// Consecutive reads over a mixed 1-, 2- and 3-byte string, followed by
+// NEED_MORE once the data is exhausted.
+TEST(utf8, multiple1) {
+  const std::string_view input(
+      "\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69"
+      "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74");
+  const uint32_t expected[] = {
+    'M', 0xEC, 'n', 'h', ' ',
+    'n', 0xF3, 'i', ' ',
+    't', 'i', 0x1EBF, 'n', 'g', ' ',
+    'V', 'i', 0x1EC7, 't',
+    utf::NEED_MORE,
+  };
+  size_t pos = 0;
+  for (const uint32_t want : expected)
+    EXPECT_EQ(want, utf::read8(input, pos));
+  EXPECT_EQ(input.size(), pos);
+}
+
+// Consecutive reads over 3- and 4-byte sequences, followed by NEED_MORE
+// once the data is exhausted.
+TEST(utf8, multiple2) {
+  const std::string_view input(
+      "\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A");
+  const uint32_t expected[] = {
+    0x2825F, 0x5450, 0x35C2, 0x8D8A, utf::NEED_MORE,
+  };
+  size_t pos = 0;
+  for (const uint32_t want : expected)
+    EXPECT_EQ(want, utf::read8(input, pos));
+  EXPECT_EQ(input.size(), pos);
+}