WIP

author: Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
committer: Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
commit: fc4547b412e28164af1bf8981234c6af959ccc0b (patch)
tree: 061253e7a4f6abaca282223b36d10f0bed8cad23 /utf
11 files changed, 801 insertions, 0 deletions
diff --git a/utf/inc/utf16.hh b/utf/inc/utf16.hh
new file mode 100644
index 0000000..344b1a2
--- /dev/null
+++ b/utf/inc/utf16.hh
@@ -0,0 +1,31 @@
+#ifndef UTF_UTF16_HH
+#define UTF_UTF16_HH
+
+#include "macros.hh"
+
+#include <cstdint>
+#include <string_view>
+
+namespace utf {
+
+/* Read one unicode codepoint from UTF-16 BigEndian encoded data if possible.
+ * If successful, offset is advanced to point to the next codepoint.
+ * Fails, leaving offset unchanged, when:
+ * - not enough data is left in data given offset, returns NEED_MORE.
+ * - data is not valid UTF-16, i.e. unpaired or misordered surrogates,
+ *   returns INVALID.
+ */
+uint32_t HIDDEN read16be(std::string_view data, std::size_t& offset);
+
+/* Read one unicode codepoint from UTF-16 LittleEndian encoded data if possible.
+ * If successful, offset is advanced to point to the next codepoint.
+ * Fails, leaving offset unchanged, when:
+ * - not enough data is left in data given offset, returns NEED_MORE.
+ * - data is not valid UTF-16, i.e. unpaired or misordered surrogates,
+ *   returns INVALID.
+ */
+uint32_t HIDDEN read16le(std::string_view data, std::size_t& offset);
+
+}  // namespace utf
+
+#endif  // UTF_UTF16_HH
diff --git a/utf/inc/utf32.hh b/utf/inc/utf32.hh
new file mode 100644
index 0000000..2d3088e
--- /dev/null
+++ b/utf/inc/utf32.hh
@@ -0,0 +1,29 @@
+#ifndef UTF_UTF32_HH
+#define UTF_UTF32_HH
+
+#include "macros.hh"
+
+#include <cstdint>
+#include <string_view>
+
+namespace utf {
+
+/* Read one unicode codepoint from UTF-32 BigEndian encoded data if possible.
+ * If successful, offset is advanced to point to the next codepoint.
+ * Fails, leaving offset unchanged, when:
+ * - not enough data is left in data given offset, returns NEED_MORE.
+ * - data is not valid UTF-32, i.e. outside valid ranges, returns INVALID.
+ */
+uint32_t HIDDEN read32be(std::string_view data, std::size_t& offset);
+
+/* Read one unicode codepoint from UTF-32 LittleEndian encoded data if possible.
+ * If successful, offset is advanced to point to the next codepoint.
+ * Fails, leaving offset unchanged, when:
+ * - not enough data is left in data given offset, returns NEED_MORE.
+ * - data is not valid UTF-32, i.e. outside valid ranges, returns INVALID.
+ */
+uint32_t HIDDEN read32le(std::string_view data, std::size_t& offset);
+
+}  // namespace utf
+
+#endif  // UTF_UTF32_HH
diff --git a/utf/inc/utf8.hh b/utf/inc/utf8.hh
new file mode 100644
index 0000000..a3ea84a
--- /dev/null
+++ b/utf/inc/utf8.hh
@@ -0,0 +1,22 @@
+#ifndef UTF_UTF8_HH
+#define UTF_UTF8_HH
+
+#include "macros.hh"
+
+#include <cstdint>
+#include <string_view>
+
+namespace utf {
+
+/* Read one unicode codepoint from UTF-8 encoded data if possible.
+ * If successful, offset is advanced to point to the next codepoint.
+ * Fails, leaving offset unchanged, when:
+ * - not enough data is left in data given offset, returns NEED_MORE.
+ * - data is not valid UTF-8, this includes overlong encodings and
+ *   invalid unicode code points, returns INVALID.
+ */
+uint32_t HIDDEN read8(std::string_view data, std::size_t& offset);
+
+}  // namespace utf
+
+#endif  // UTF_UTF8_HH
diff --git a/utf/inc/utf_error.hh b/utf/inc/utf_error.hh
new file mode 100644
index 0000000..079fa43
--- /dev/null
+++ b/utf/inc/utf_error.hh
@@ -0,0 +1,13 @@
+#ifndef UTF_ERROR_HH
+#define UTF_ERROR_HH
+
+#include <cstdint>
+
+namespace utf {
+
+constexpr uint32_t NEED_MORE{0xfffffffeU};  // Input ends before a full unit.
+constexpr uint32_t INVALID{0xffffffffU};    // Input cannot be decoded.
+
+}  // namespace utf
+
+#endif  // UTF_ERROR_HH
diff --git a/utf/meson.build b/utf/meson.build
new file mode 100644
index 0000000..64db6ff
--- /dev/null
+++ b/utf/meson.build
@@ -0,0 +1,38 @@
+deps = [  # dependencies shared by the library and everything linking to it
+  base_dep,  # NOTE(review): declared outside this file; confirm what it provides
+]
+
+inc = include_directories('inc')  # headers under utf/inc
+lib = static_library(
+  'utf',
+  'src/utf8.cc',
+  'src/utf16.cc',
+  'src/utf32.cc',
+  dependencies: deps,
+  include_directories: inc,
+  install: false,  # build-tree only, not part of the installed output
+)
+
+utf_dep = declare_dependency(  # handle used by the tests below to link 'utf'
+  dependencies: deps,
+  include_directories: inc,
+  link_with: lib,
+)
+
+test('utf8',  # one gtest-based executable per encoding
+     executable(
+       'test_utf8',
+       sources: ['tst/test_utf8.cc'],
+       dependencies: [utf_dep, gtest_dep]))
+
+test('utf16',
+     executable(
+       'test_utf16',
+       sources: ['tst/test_utf16.cc'],
+       dependencies: [utf_dep, gtest_dep]))
+
+test('utf32',
+     executable(
+       'test_utf32',
+       sources: ['tst/test_utf32.cc'],
+       dependencies: [utf_dep, gtest_dep]))
diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc
new file mode 100644
index 0000000..43595bf
--- /dev/null
+++ b/utf/src/utf16.cc
@@ -0,0 +1,67 @@
+#include "utf16.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+inline bool is_high_surrogate(uint16_t c) {
+  return (c & 0xfc00) == 0xd800;  // Top six bits 110110: 0xd800..0xdbff.
+}
+
+inline bool is_low_surrogate(uint16_t c) {
+  return (c & 0xfc00) == 0xdc00;  // Top six bits 110111: 0xdc00..0xdfff.
+}
+
+}  // namespace
+
+uint32_t read16be(std::string_view data, std::size_t& offset) {
+  if (offset > data.size() || data.size() - offset < 2)
+    return NEED_MORE;
+  const auto lead = static_cast<uint16_t>(
+      static_cast<uint8_t>(data[offset]) << 8
+      | static_cast<uint8_t>(data[offset + 1]));
+  if (is_low_surrogate(lead))
+    return INVALID;  // A pair must start with the high surrogate.
+  if (!is_high_surrogate(lead)) {
+    offset += 2;  // Not a surrogate: the unit is the codepoint.
+    return lead;
+  }
+  if (data.size() - offset < 4)
+    return NEED_MORE;  // The second unit of the pair is not available.
+  const auto trail = static_cast<uint16_t>(
+      static_cast<uint8_t>(data[offset + 2]) << 8
+      | static_cast<uint8_t>(data[offset + 3]));
+  if (!is_low_surrogate(trail))
+    return INVALID;  // High surrogate without a low surrogate after it.
+  offset += 4;
+  return 0x10000
+      + (static_cast<uint32_t>(lead & 0x3ff) << 10 | (trail & 0x3ff));
+}
+
+uint32_t read16le(std::string_view data, std::size_t& offset) {
+  if (offset > data.size() || data.size() - offset < 2)
+    return NEED_MORE;
+  const auto lead = static_cast<uint16_t>(
+      static_cast<uint8_t>(data[offset + 1]) << 8
+      | static_cast<uint8_t>(data[offset]));
+  if (is_low_surrogate(lead))
+    return INVALID;  // A pair must start with the high surrogate.
+  if (!is_high_surrogate(lead)) {
+    offset += 2;  // Not a surrogate: the unit is the codepoint.
+    return lead;
+  }
+  if (data.size() - offset < 4)
+    return NEED_MORE;  // The second unit of the pair is not available.
+  const auto trail = static_cast<uint16_t>(
+      static_cast<uint8_t>(data[offset + 3]) << 8
+      | static_cast<uint8_t>(data[offset + 2]));
+  if (!is_low_surrogate(trail))
+    return INVALID;  // High surrogate without a low surrogate after it.
+  offset += 4;
+  return 0x10000
+      + (static_cast<uint32_t>(lead & 0x3ff) << 10 | (trail & 0x3ff));
+}
+
+}  // namespace utf
diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc
new file mode 100644
index 0000000..cfa29b6
--- /dev/null
+++ b/utf/src/utf32.cc
@@ -0,0 +1,43 @@
+#include "utf32.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+inline bool valid_codepoint(uint32_t c) {
+  return c <= 0x10ffff && (c < 0xd800 || c > 0xdfff);  // In range, no surrogate.
+}
+
+}  // namespace
+
+uint32_t read32be(std::string_view data, std::size_t& offset) {
+  if (offset > data.size() || data.size() - offset < 4)
+    return NEED_MORE;
+  // Fold the four bytes together, most significant byte first.
+  uint32_t value = 0;
+  for (std::size_t i = 0; i < 4; ++i)
+    value = value << 8 | static_cast<uint8_t>(data[offset + i]);
+  // Only consume the unit when it holds an acceptable codepoint.
+  if (!valid_codepoint(value))
+    return INVALID;
+  offset += 4;
+  return value;
+}
+
+uint32_t read32le(std::string_view data, std::size_t& offset) {
+  if (offset > data.size() || data.size() - offset < 4)
+    return NEED_MORE;
+  // Fold the four bytes together, starting with the last (most significant).
+  uint32_t value = 0;
+  for (std::size_t i = 4; i > 0; --i)
+    value = value << 8 | static_cast<uint8_t>(data[offset + i - 1]);
+  // Only consume the unit when it holds an acceptable codepoint.
+  if (!valid_codepoint(value))
+    return INVALID;
+  offset += 4;
+  return value;
+}
+
+}  // namespace utf
diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc
new file mode 100644
index 0000000..54b0296
--- /dev/null
+++ b/utf/src/utf8.cc
@@ -0,0 +1,68 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+inline bool valid_codepoint(uint32_t c) {
+  return !(c >= 0xd800 && c <= 0xdfff) && c <= 0x10ffff;  // No surrogates.
+}
+
+}  // namespace
+
+/* Decode the UTF-8 sequence starting at data[offset].
+ * Returns the codepoint and advances offset past it, or returns NEED_MORE /
+ * INVALID and leaves offset untouched.
+ */
+uint32_t read8(std::string_view data, std::size_t& offset) {
+  if (offset >= data.size())
+    return NEED_MORE;
+  // Classify the lead byte: total sequence length, its payload bits and the
+  // smallest codepoint that really needs that length (anything below it is an
+  // overlong encoding).
+  const auto lead = static_cast<uint8_t>(data[offset]);
+  uint32_t ret;
+  uint32_t lowest;
+  uint8_t size;
+  if (lead < 0x80) {
+    ++offset;
+    return lead;
+  } else if (lead < 0xc0) {
+    return INVALID;  // Continuation byte where a lead byte is expected.
+  } else if (lead < 0xe0) {
+    ret = static_cast<uint32_t>(lead & 0x1f) << 6;
+    lowest = 0x80;
+    size = 2;
+  } else if (lead < 0xf0) {
+    ret = static_cast<uint32_t>(lead & 0x0f) << 12;
+    lowest = 0x800;
+    size = 3;
+  } else if (lead < 0xf8) {
+    ret = static_cast<uint32_t>(lead & 0x07) << 18;
+    lowest = 0x10000;
+    size = 4;
+  } else {
+    return INVALID;  // 0xf8..0xff never start a sequence.
+  }
+  // Check the continuation bytes that are already present before asking for
+  // more: a malformed sequence is INVALID no matter what follows, and a
+  // streaming caller must not be told to wait for bytes that cannot fix it.
+  const std::size_t avail = data.size() - offset;
+  for (uint8_t i = 1; i < size && i < avail; ++i) {
+    const auto c = static_cast<uint8_t>(data[offset + i]);
+    if ((c & 0xc0) != 0x80)
+      return INVALID;
+    ret |= static_cast<uint32_t>(c & 0x3f) << (size - i - 1) * 6;
+  }
+  if (avail < size)
+    return NEED_MORE;
+  // Overlong encodings, surrogates and values above 0x10ffff are rejected.
+  if (ret < lowest || !valid_codepoint(ret))
+    return INVALID;
+  offset += size;
+  return ret;
+}
+
+}  // namespace utf
diff --git a/utf/tst/test_utf16.cc b/utf/tst/test_utf16.cc
new file mode 100644
index 0000000..c17982e
--- /dev/null
+++ b/utf/tst/test_utf16.cc
@@ -0,0 +1,157 @@
+#include "utf16.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+TEST(utf16be, sanity) {
+  std::string_view input("\x00\x24", 2);  // Explicit size: leading NUL byte.
+  std::size_t pos = 0;
+  auto cp = utf::read16be(input, pos);
+  EXPECT_EQ('$', cp);
+  EXPECT_EQ(2u, pos);
+
+  input = "\x20\xAC";
+  pos = 0;
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(0x20AC, cp);
+  EXPECT_EQ(2u, pos);
+
+  input = "\xD8\x01\xDC\x37";  // Surrogate pair.
+  pos = 0;
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(0x10437, cp);
+  EXPECT_EQ(4u, pos);
+
+  input = "\xD8\x52\xDF\x62";  // Surrogate pair.
+  pos = 0;
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(0x24B62, cp);
+  EXPECT_EQ(4u, pos);
+}
+
+TEST(utf16le, sanity) {
+  std::string_view input("\x24\x00", 2);  // Explicit size: trailing NUL byte.
+  std::size_t pos = 0;
+  auto cp = utf::read16le(input, pos);
+  EXPECT_EQ('$', cp);
+  EXPECT_EQ(2u, pos);
+
+  input = "\xAC\x20";
+  pos = 0;
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(0x20AC, cp);
+  EXPECT_EQ(2u, pos);
+
+  input = "\x01\xD8\x37\xDC";  // Surrogate pair.
+  pos = 0;
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(0x10437, cp);
+  EXPECT_EQ(4u, pos);
+
+  input = "\x52\xD8\x62\xDF";  // Surrogate pair.
+  pos = 0;
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(0x24B62, cp);
+  EXPECT_EQ(4u, pos);
+}
+
+TEST(utf16be, bom) {
+  std::string_view input("\xFE\xFF\x20\xAC");
+  std::size_t pos = 0;
+  auto cp = utf::read16be(input, pos);
+  EXPECT_EQ(0xFEFF, cp);  // The BOM is returned like any other codepoint.
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(0x20AC, cp);
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);  // Nothing left to read.
+  EXPECT_EQ(input.size(), pos);
+}
+
+TEST(utf16le, bom) {
+  std::string_view input("\xFF\xFE\xAC\x20");
+  std::size_t pos = 0;
+  auto cp = utf::read16le(input, pos);
+  EXPECT_EQ(0xFEFF, cp);  // The BOM is returned like any other codepoint.
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(0x20AC, cp);
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);  // Nothing left to read.
+  EXPECT_EQ(input.size(), pos);
+}
+
+TEST(utf16be, invalid) {
+  std::string_view input("\xD8");  // Half a unit.
+  std::size_t pos = 0;
+  auto cp = utf::read16be(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "";
+  pos = 0;
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xD8\x01";  // High surrogate only.
+  pos = 0;
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xD8\x01\xDC";  // Second unit cut short.
+  pos = 0;
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xDC\x37\xD8\x01";  // Low surrogate first.
+  pos = 0;
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xD8\x01\xD8\x01";  // Two high surrogates.
+  pos = 0;
+  cp = utf::read16be(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+}
+
+TEST(utf16le, invalid) {
+  std::string_view input("\x01");  // Half a unit.
+  std::size_t pos = 0;
+  auto cp = utf::read16le(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "";
+  pos = 0;
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\x01\xD8";  // High surrogate only.
+  pos = 0;
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\x01\xD8\x37";  // Second unit cut short.
+  pos = 0;
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\x37\xDC\x01\xD8";  // Low surrogate first.
+  pos = 0;
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\x01\xD8\x01\xD8";  // Two high surrogates.
+  pos = 0;
+  cp = utf::read16le(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+}
diff --git a/utf/tst/test_utf32.cc b/utf/tst/test_utf32.cc
new file mode 100644
index 0000000..796b4cd
--- /dev/null
+++ b/utf/tst/test_utf32.cc
@@ -0,0 +1,145 @@
+#include "utf32.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+TEST(utf32be, sanity) {
+  std::string_view input("\x00\x00\x00\x24", 4);  // Sizes given: NUL bytes.
+  std::size_t pos = 0;
+  auto cp = utf::read32be(input, pos);
+  EXPECT_EQ('$', cp);
+  EXPECT_EQ(4u, pos);
+
+  input = std::string_view("\x00\x00\x20\xAC", 4);
+  pos = 0;
+  cp = utf::read32be(input, pos);
+  EXPECT_EQ(0x20AC, cp);
+  EXPECT_EQ(4u, pos);
+
+  input = std::string_view("\x00\x01\x04\x37", 4);
+  pos = 0;
+  cp = utf::read32be(input, pos);
+  EXPECT_EQ(0x10437, cp);
+  EXPECT_EQ(4u, pos);
+}
+
+TEST(utf32le, sanity) {
+  std::string_view input("\x24\x00\x00\x00", 4);  // Sizes given: NUL bytes.
+  std::size_t pos = 0;
+  auto cp = utf::read32le(input, pos);
+  EXPECT_EQ('$', cp);
+  EXPECT_EQ(4u, pos);
+
+  input = std::string_view("\xAC\x20\x00\x00", 4);
+  pos = 0;
+  cp = utf::read32le(input, pos);
+  EXPECT_EQ(0x20AC, cp);
+  EXPECT_EQ(4u, pos);
+
+  input = std::string_view("\x37\x04\x01\x00", 4);
+  pos = 0;
+  cp = utf::read32le(input, pos);
+  EXPECT_EQ(0x10437, cp);
+  EXPECT_EQ(4u, pos);
+}
+
+TEST(utf32be, invalid) {
+  std::string_view input("\xFF\xFF\xFF\xFF");  // Above 0x10ffff.
+  std::size_t pos = 0;
+  auto cp = utf::read32be(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = std::string_view("\x00\x00\xD8\x00", 4);  // Surrogate value.
+  pos = 0;
+  cp = utf::read32be(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "";
+  pos = 0;
+  cp = utf::read32be(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = std::string_view("\x00", 1);
+  pos = 0;
+  cp = utf::read32be(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = std::string_view("\x00\x00", 2);
+  pos = 0;
+  cp = utf::read32be(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = std::string_view("\x00\x00\x00", 3);
+  pos = 0;
+  cp = utf::read32be(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+}
+
+TEST(utf32le, invalid) {
+  std::string_view input("\xFF\xFF\xFF\xFF");  // Above 0x10ffff.
+  std::size_t pos = 0;
+  auto cp = utf::read32le(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = std::string_view("\x00\xD8\x00\x00", 4);  // Surrogate value.
+  pos = 0;
+  cp = utf::read32le(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "";
+  pos = 0;
+  cp = utf::read32le(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = std::string_view("\x00", 1);
+  pos = 0;
+  cp = utf::read32le(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = std::string_view("\x00\x00", 2);
+  pos = 0;
+  cp = utf::read32le(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = std::string_view("\x00\x00\x00", 3);
+  pos = 0;
+  cp = utf::read32le(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+}
+
+TEST(utf32be, bom) {
+  std::string_view str("\x00\x00\xFE\xFF\x00\x00\x20\xAC", 8);
+  size_t offset = 0;
+  auto ret = utf::read32be(str, offset);
+  EXPECT_EQ(0xFEFF, ret);  // The byte order mark is U+FEFF, not U+FFFE.
+  ret = utf::read32be(str, offset);
+  EXPECT_EQ(0x20AC, ret);
+  ret = utf::read32be(str, offset);
+  EXPECT_EQ(utf::NEED_MORE, ret);  // Both codepoints consumed.
+  EXPECT_EQ(str.size(), offset);
+}
+
+TEST(utf32le, bom) {
+  std::string_view str("\xFF\xFE\x00\x00\xAC\x20\x00\x00", 8);
+  size_t offset = 0;
+  auto ret = utf::read32le(str, offset);
+  EXPECT_EQ(0xFEFF, ret);  // The byte order mark is U+FEFF, not U+FFFE.
+  ret = utf::read32le(str, offset);
+  EXPECT_EQ(0x20AC, ret);
+  ret = utf::read32le(str, offset);
+  EXPECT_EQ(utf::NEED_MORE, ret);  // Both codepoints consumed.
+  EXPECT_EQ(str.size(), offset);
+}
diff --git a/utf/tst/test_utf8.cc b/utf/tst/test_utf8.cc
new file mode 100644
index 0000000..10df969
--- /dev/null
+++ b/utf/tst/test_utf8.cc
@@ -0,0 +1,188 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+TEST(utf8, sanity) {
+  std::string_view input("$");  // One byte.
+  std::size_t pos = 0;
+  auto cp = utf::read8(input, pos);
+  EXPECT_EQ('$', cp);
+  EXPECT_EQ(1u, pos);
+
+  input = "\xC2\xA3";  // Two bytes.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0xa3, cp);
+  EXPECT_EQ(2u, pos);
+
+  input = "\xD0\x98";
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0x418, cp);
+  EXPECT_EQ(2u, pos);
+
+  input = "\xE0\xA4\xB9";  // Three bytes.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0x939, cp);
+  EXPECT_EQ(3u, pos);
+
+  input = "\xE2\x82\xAC";
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0x20AC, cp);
+  EXPECT_EQ(3u, pos);
+
+  input = "\xED\x95\x9C";
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0xD55C, cp);
+  EXPECT_EQ(3u, pos);
+
+  input = "\xF0\x90\x8D\x88";  // Four bytes.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0x10348, cp);
+  EXPECT_EQ(4u, pos);
+}
+
+TEST(utf8, overlong) {
+  std::string_view input("\xF0\x82\x82\xAC");  // Four bytes, value < 0x10000.
+  std::size_t pos = 0;
+  auto cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xE0\x81\x81";  // Three bytes, value < 0x800.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xC0\x80";  // Two bytes, value < 0x80.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+}
+
+TEST(utf8, invalid) {
+  std::string_view input("\xED\xB0\x80");  // Encodes a surrogate.
+  std::size_t pos = 0;
+  auto cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xFB\xFF\xFF";
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xFF\xFF\xFF\xFF\xFF";
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "";
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\x80";  // Continuation byte on its own.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xC2";  // Truncated two byte sequence.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xC2\x03";  // Second byte is not a continuation byte.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::INVALID, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xE0\xA4";  // Truncated three byte sequence.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+
+  input = "\xF0\x90\x8D";  // Truncated four byte sequence.
+  pos = 0;
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);
+  EXPECT_EQ(0u, pos);
+}
+
+TEST(utf8, multiple1) {
+  std::string_view input("\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69"
+                         "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74");
+  std::size_t pos = 0;  // Carried across calls to walk the whole string.
+  auto cp = utf::read8(input, pos);
+  EXPECT_EQ('M', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0xEC, cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('n', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('h', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(' ', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('n', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0xF3, cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('i', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(' ', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('t', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('i', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0x1EBF, cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('n', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('g', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(' ', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('V', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('i', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0x1EC7, cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ('t', cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);  // End of input reached.
+  EXPECT_EQ(input.size(), pos);
+}
+
+TEST(utf8, multiple2) {
+  std::string_view input("\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A");
+  std::size_t pos = 0;  // Carried across calls to walk the whole string.
+  auto cp = utf::read8(input, pos);
+  EXPECT_EQ(0x2825F, cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0x5450, cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0x35C2, cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(0x8D8A, cp);
+  cp = utf::read8(input, pos);
+  EXPECT_EQ(utf::NEED_MORE, cp);  // End of input reached.
+  EXPECT_EQ(input.size(), pos);
+}
author	Joel Klinghed <the_jk@spawned.biz>	2023-06-13 10:07:16 +0200
committer	Joel Klinghed <the_jk@spawned.biz>	2023-06-13 10:07:16 +0200
commit	fc4547b412e28164af1bf8981234c6af959ccc0b (patch)
tree	061253e7a4f6abaca282223b36d10f0bed8cad23 /utf