Add utf module

Currently only has valid_utf8 and read_utf8
author: Joel Klinghed <the_jk@yahoo.com> 2017-08-06 22:07:03 +0200
committer: Joel Klinghed <the_jk@yahoo.com> 2017-08-06 22:24:59 +0200
commit: be1dc86b5434e84a88ba2e62873d87c62f8880c0 (patch)
tree: ba73db824c64890d196f843b0b72e11b0185e7fc
parent: d935594d14eb65b55db0d4e96d66a7404f2130d0 (diff)
6 files changed, 171 insertions, 2 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 4aaf8b3..9622262 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -36,7 +36,7 @@ libtp_a_SOURCES = args.cc args.hh xdg.cc xdg.hh terminal.cc terminal.hh \
 			      character.cc character.hh config.cc config.hh \
                   strings.cc strings.hh io.cc io.hh looper.cc looper.hh \
 				  buffer.cc buffer.hh chunked.cc chunked.hh \
-                  package.cc package.hh data.hh common.hh
+                  package.cc package.hh data.hh common.hh utf.hh utf.cc
 if !HAVE_SSL
 libtp_a_SOURCES += mitm_stub.cc
 endif
diff --git a/src/utf.cc b/src/utf.cc
new file mode 100644
index 0000000..208dd33
--- /dev/null
+++ b/src/utf.cc
@@ -0,0 +1,87 @@
+// -*- mode: c++; c-basic-offset: 2; -*-
+
+#include "common.hh"
+
+#include <string.h>
+
+#include "utf.hh"
+
+// Decodes the first UTF-8 sequence in in[0, max) and stores its code point in
+// *out when out is non-null.
+//
+// Returns a pointer just past the decoded sequence. Returns `in` itself when
+// max is 0 or when the first byte is NUL (*out is set to 0 for NUL only).
+// Returns nullptr when in is null or the sequence is truncated by max,
+// malformed, overlong, a UTF-16 surrogate, or above U+10FFFF; *out is then
+// left untouched.
+char* read_utf8(char const* in, size_t max, uint32_t* out) {
+  if (!in) return nullptr;
+  if (max == 0) return const_cast<char*>(in);
+  auto d = reinterpret_cast<uint8_t*>(const_cast<char*>(in));
+  if (*d < 0x80) {
+    // ASCII. NUL is reported without advancing so callers can detect it.
+    if (out) *out = *d;
+    return reinterpret_cast<char*>(*d == 0 ? d : d + 1);
+  }
+
+  size_t len;         // total bytes in the sequence
+  uint32_t tmp;       // code point being assembled
+  uint32_t smallest;  // lowest code point that needs len bytes
+  if ((*d & 0xe0) == 0xc0) {
+    len = 2;
+    tmp = *d & 0x1f;
+    smallest = 0x80;
+  } else if ((*d & 0xf0) == 0xe0) {
+    len = 3;
+    tmp = *d & 0xf;
+    smallest = 0x800;
+  } else if ((*d & 0xf8) == 0xf0) {
+    len = 4;
+    tmp = *d & 0x7;
+    smallest = 0x10000;
+  } else {
+    // 0x80-0xbf is a continuation byte with no lead byte, and 0xf8-0xff never
+    // appears in UTF-8; neither may start a sequence.
+    return nullptr;
+  }
+  if (max < len) return nullptr;
+  // Every trailing byte must look like 10xxxxxx and adds six payload bits.
+  for (size_t i = 1; i < len; ++i) {
+    if ((d[i] & 0xc0) != 0x80) return nullptr;
+    tmp = tmp << 6 | (d[i] & 0x3f);
+  }
+  if (tmp < smallest) return nullptr;  // overlong encoding
+  if (tmp >= 0xd800 && tmp <= 0xdfff) return nullptr;  // surrogate
+  if (tmp > 0x10ffff) return nullptr;  // beyond the Unicode range
+  if (out) *out = tmp;
+  return reinterpret_cast<char*>(d + len);
+}
+
+// Validates the bytes [start, start + len) of str, clamped to str's bounds;
+// len == std::string::npos means "to the end". An empty range is valid.
+bool valid_utf8(std::string const& str, size_t start, size_t len) {
+  if (start > str.size()) start = str.size();
+  // Compare against the remaining size so start + len cannot wrap around.
+  if (len > str.size() - start) len = str.size() - start;
+  return valid_utf8(str.data() + start, len);
+}
+
+// Returns true when the len bytes at str split into sequences that read_utf8
+// accepts, with no NUL byte among them. len == std::string::npos means str is
+// NUL-terminated and is measured with strlen. An empty range is always valid.
+bool valid_utf8(char const* str, size_t len) {
+  if (len == 0) return true;
+  if (str == nullptr) {
+    assert(false);  // a null buffer with a non-zero length is a caller bug
+    return false;
+  }
+  if (len == std::string::npos) len = strlen(str);
+  char const* const limit = str + len;
+  for (char const* cursor = str; cursor < limit;) {
+    char const* const after = read_utf8(cursor, limit - cursor, nullptr);
+    // nullptr: rejected sequence. No progress: NUL byte inside the range.
+    if (after == nullptr || after == cursor) return false;
+    cursor = after;
+  }
+  return true;
+}
diff --git a/src/utf.hh b/src/utf.hh
new file mode 100644
index 0000000..7625b1b
--- /dev/null
+++ b/src/utf.hh
@@ -0,0 +1,14 @@
+// -*- mode: c++; c-basic-offset: 2; -*-
+
+#ifndef UTF_HH
+#define UTF_HH
+
+#include <string>
+// Decodes one UTF-8 sequence: returns its end (in at NUL), nullptr on error.
+char* read_utf8(char const* in, size_t max, uint32_t* out);
+// Checks a whole byte range; npos means "up to the end of the string".
+bool valid_utf8(std::string const& str, size_t start = 0,
+                size_t len = std::string::npos);
+bool valid_utf8(char const* str, size_t len = std::string::npos);
+
+#endif  // UTF_HH
diff --git a/test/.gitignore b/test/.gitignore
index 8bf01f1..f1a5a11 100644
--- a/test/.gitignore
+++ b/test/.gitignore
@@ -10,3 +10,4 @@
 /test-observers
 /test-htmlattrtext
 /test-package
+/test-utf
diff --git a/test/Makefile.am b/test/Makefile.am
index abc1f9c..5248547 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -3,7 +3,7 @@ MAINTAINERCLEANFILES = Makefile.in
 AM_CXXFLAGS = @DEFINES@ -I$(top_srcdir)/src
 
 TESTS = test-url test-http test-args test-xdg test-paths test-strings \
-        test-observers test-htmlattrtext test-package
+        test-observers test-htmlattrtext test-package test-utf
 
 check_PROGRAMS = $(TESTS)
 
@@ -32,3 +32,6 @@ test_htmlattrtext_LDADD = $(top_builddir)/src/libattrstr.a
 
 test_package_SOURCES = test-package.cc test.hh
 test_package_LDADD = $(top_builddir)/src/libtp.a
+
+test_utf_SOURCES = test-utf.cc test.hh
+test_utf_LDADD = $(top_builddir)/src/libtp.a
diff --git a/test/test-utf.cc b/test/test-utf.cc
new file mode 100644
index 0000000..7096a47
--- /dev/null
+++ b/test/test-utf.cc
@@ -0,0 +1,64 @@
+// -*- mode: c++; c-basic-offset: 2; -*-
+
+#include "common.hh"
+#include "test.hh"
+
+#include <stdarg.h>
+
+#include "utf.hh"
+
+namespace {
+
+bool test_empty() {  // empty inputs are valid and consume nothing
+  ASSERT_EQ(true, valid_utf8(""));
+  ASSERT_EQ(true, valid_utf8("", 0));
+  ASSERT_EQ(true, valid_utf8("\xff", 0));  // len 0: the bad byte is never read
+  auto str = "";
+  ASSERT_EQ(str, read_utf8(str, 0, nullptr));  // max 0: input returned as is
+  ASSERT_EQ(str, read_utf8(str, 10, nullptr));  // NUL first: no advance
+  return true;
+}
+
+bool test_good(char const* str, ...) {  // varargs: expected code points, then 0
+  ASSERT_EQ(true, valid_utf8(str));
+  va_list args;
+  va_start(args, str);
+  auto pos = str;
+  auto end = str + strlen(str) + 1;  // + 1 so the terminating NUL is decoded
+  uint32_t value;
+  while (true) {
+    uint32_t expected = va_arg(args, uint32_t);
+    auto next = read_utf8(pos, end - pos, &value);
+    if (expected == 0) {  // end of list: read_utf8 must stop at the NUL
+      ASSERT_EQ(pos, next);
+      ASSERT_EQ(expected, value);
+      break;
+    } else {  // otherwise it must advance and yield the expected code point
+      ASSERT_EQ(false, pos == next);
+      ASSERT_EQ(false, next == nullptr);
+      ASSERT_EQ(expected, value);
+    }
+    pos = next;
+  }
+  va_end(args);  // NOTE(review): skipped if ASSERT_EQ returns early - confirm
+  return true;
+}
+
+bool test_bad(char const* str) {  // passes when valid_utf8 rejects str
+  ASSERT_EQ(false, valid_utf8(str));
+  return true;
+}
+
+}  // namespace
+
+int main(void) {  // BEFORE / RUN / AFTER presumably come from test.hh - confirm
+  BEFORE;
+  RUN(test_empty());
+  RUN(test_good("$", 0x24, 0));  // 1-byte sequence
+  RUN(test_good("\xc2\xa2", 0xa2, 0));  // 2-byte sequence
+  RUN(test_good("\xe2\x82\xac", 0x20ac, 0));  // 3-byte sequence
+  RUN(test_good("\xf0\x90\x8d\x88", 0x10348, 0));  // 4-byte sequence
+  RUN(test_bad("\xf0\x82\x82\xac"));  // overlong 4-byte encoding of U+20AC
+  AFTER;
+}
+
author	Joel Klinghed <the_jk@yahoo.com>	2017-08-06 22:07:03 +0200
committer	Joel Klinghed <the_jk@yahoo.com>	2017-08-06 22:24:59 +0200
commit	be1dc86b5434e84a88ba2e62873d87c62f8880c0 (patch)
tree	ba73db824c64890d196f843b0b72e11b0185e7fc
parent	d935594d14eb65b55db0d4e96d66a7404f2130d0 (diff)