summary refs log tree commit diff
path: root/utf/tst/test_utf8.cc
diff options
context:
space:
mode:
author Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
committer Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
commit fc4547b412e28164af1bf8981234c6af959ccc0b (patch)
tree 061253e7a4f6abaca282223b36d10f0bed8cad23 /utf/tst/test_utf8.cc
WIP
Diffstat (limited to 'utf/tst/test_utf8.cc')
-rw-r--r-- utf/tst/test_utf8.cc 188
1 files changed, 188 insertions, 0 deletions
diff --git a/utf/tst/test_utf8.cc b/utf/tst/test_utf8.cc
new file mode 100644
index 0000000..10df969
--- /dev/null
+++ b/utf/tst/test_utf8.cc
@@ -0,0 +1,188 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+#include <gtest/gtest.h>
+
+TEST(utf8, sanity) {  // Well-formed 1- to 4-byte sequences: expect the code point and offset == sequence length.
+ std::string_view str("$");  // 1 byte: U+0024
+ size_t offset = 0;
+ auto ret = utf::read8(str, offset);
+ EXPECT_EQ('$', ret);
+ EXPECT_EQ(1u, offset);  // unsigned literal: offset is size_t, so this avoids a signed/unsigned compare in EXPECT_EQ
+
+ str = "\xC2\xA3";  // 2 bytes: U+00A3
+ offset = 0;  // every case below restarts decoding from the first byte
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0xa3, ret);
+ EXPECT_EQ(2u, offset);
+
+ str = "\xD0\x98";  // 2 bytes: U+0418
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x418, ret);
+ EXPECT_EQ(2u, offset);
+
+ str = "\xE0\xA4\xB9";  // 3 bytes: U+0939
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x939, ret);
+ EXPECT_EQ(3u, offset);
+
+ str = "\xE2\x82\xAC";  // 3 bytes: U+20AC
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x20AC, ret);
+ EXPECT_EQ(3u, offset);
+
+ str = "\xED\x95\x9C";  // 3 bytes: U+D55C (lead byte 0xED, but below the surrogate range)
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0xD55C, ret);
+ EXPECT_EQ(3u, offset);
+
+ str = "\xF0\x90\x8D\x88";  // 4 bytes: U+10348
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(0x10348, ret);
+ EXPECT_EQ(4u, offset);
+}
+
+TEST(utf8, overlong) {  // Overlong (non-shortest-form) encodings: expect utf::INVALID and an untouched offset.
+ std::string_view str("\xF0\x82\x82\xAC");  // U+20AC padded out to 4 bytes (shortest form is 3)
+ size_t offset = 0;
+ auto ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0u, offset);  // unsigned literal: offset is size_t, so this avoids a signed/unsigned compare in EXPECT_EQ
+
+ str = "\xE0\x81\x81";  // U+0041 padded out to 3 bytes (shortest form is 1)
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0u, offset);
+
+ str = "\xC0\x80";  // U+0000 padded out to 2 bytes (shortest form is 1)
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0u, offset);
+}
+
+TEST(utf8, invalid) {  // Malformed input expects utf::INVALID, truncated input utf::NEED_MORE; offset stays 0 in both cases.
+ std::string_view str("\xED\xB0\x80");  // encodes U+DC00, a surrogate code point
+ size_t offset = 0;
+ auto ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0u, offset);  // unsigned literal: offset is size_t, so this avoids a signed/unsigned compare in EXPECT_EQ
+
+ str = "\xFB\xFF\xFF";  // 0xFB has the obsolete 5-byte lead pattern 111110xx
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0u, offset);
+
+ str = "\xFF\xFF\xFF\xFF\xFF";  // 0xFF never appears in well-formed UTF-8
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0u, offset);
+
+ str = "";  // empty input: nothing to decode yet
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0u, offset);
+
+ str = "\x80";  // continuation byte with no lead byte
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0u, offset);
+
+ str = "\xC2";  // 2-byte lead with its continuation byte missing
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0u, offset);
+
+ str = "\xC2\x03";  // 2-byte lead followed by a non-continuation byte
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::INVALID, ret);
+ EXPECT_EQ(0u, offset);
+
+ str = "\xE0\xA4";  // 3-byte sequence cut off after 2 bytes
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0u, offset);
+
+ str = "\xF0\x90\x8D";  // 4-byte sequence cut off after 3 bytes
+ offset = 0;
+ ret = utf::read8(str, offset);
+ EXPECT_EQ(utf::NEED_MORE, ret);
+ EXPECT_EQ(0u, offset);
+}
+
+TEST(utf8, multiple1) {  // Decodes a whole string by repeated read8 calls that share one running position.
+ std::string_view input{"\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69"
+ "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74"};  // mixes 1-, 2- and 3-byte sequences
+ size_t pos{0};  // never reset: each call continues where the previous one stopped
+ auto cp = utf::read8(input, pos);
+ EXPECT_EQ('M', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(0xEC, cp);  // from bytes C3 AC
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('n', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('h', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(' ', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('n', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(0xF3, cp);  // from bytes C3 B3
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('i', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(' ', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('t', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('i', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(0x1EBF, cp);  // from bytes E1 BA BF
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('n', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('g', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(' ', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('V', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('i', cp);
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(0x1EC7, cp);  // from bytes E1 BB 87
+ cp = utf::read8(input, pos);
+ EXPECT_EQ('t', cp);
+ cp = utf::read8(input, pos);  // one read past the last character
+ EXPECT_EQ(utf::NEED_MORE, cp);
+ EXPECT_EQ(input.size(), pos);  // every byte consumed; the extra read left the position unchanged
+}
+
+TEST(utf8, multiple2) {  // Sequential decode of one 4-byte sequence followed by three 3-byte sequences.
+ std::string_view input{"\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A"};
+ size_t pos{0};  // never reset: each call continues where the previous one stopped
+ auto cp = utf::read8(input, pos);
+ EXPECT_EQ(0x2825F, cp);  // from bytes F0 A8 89 9F
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(0x5450, cp);  // from bytes E5 91 90
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(0x35C2, cp);  // from bytes E3 97 82
+ cp = utf::read8(input, pos);
+ EXPECT_EQ(0x8D8A, cp);  // from bytes E8 B6 8A
+ cp = utf::read8(input, pos);  // one read past the last character
+ EXPECT_EQ(utf::NEED_MORE, cp);
+ EXPECT_EQ(input.size(), pos);  // every byte consumed; the extra read left the position unchanged
+}