summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Joel Klinghed <the_jk@yahoo.com> 2017-08-06 22:07:03 +0200
committer: Joel Klinghed <the_jk@yahoo.com> 2017-08-06 22:24:59 +0200
commit: be1dc86b5434e84a88ba2e62873d87c62f8880c0 (patch)
tree: ba73db824c64890d196f843b0b72e11b0185e7fc /src
parent: d935594d14eb65b55db0d4e96d66a7404f2130d0 (diff)
Add utf module
Currently only has valid_utf8 and read_utf8
Diffstat (limited to 'src')
-rw-r--r-- src/Makefile.am 2
-rw-r--r-- src/utf.cc 87
-rw-r--r-- src/utf.hh 14
3 files changed, 102 insertions, 1 deletion
diff --git a/src/Makefile.am b/src/Makefile.am
index 4aaf8b3..9622262 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -36,7 +36,7 @@ libtp_a_SOURCES = args.cc args.hh xdg.cc xdg.hh terminal.cc terminal.hh \
character.cc character.hh config.cc config.hh \
strings.cc strings.hh io.cc io.hh looper.cc looper.hh \
buffer.cc buffer.hh chunked.cc chunked.hh \
- package.cc package.hh data.hh common.hh
+ package.cc package.hh data.hh common.hh utf.hh utf.cc
if !HAVE_SSL
libtp_a_SOURCES += mitm_stub.cc
endif
diff --git a/src/utf.cc b/src/utf.cc
new file mode 100644
index 0000000..208dd33
--- /dev/null
+++ b/src/utf.cc
@@ -0,0 +1,87 @@
+// -*- mode: c++; c-basic-offset: 2; -*-
+
+#include "common.hh"
+
+#include <string.h>
+
+#include "utf.hh"
+
+// Decode a single UTF-8 encoded code point from the start of |in|.
+//
+// in:  start of the input; may be null.
+// max: number of bytes that may be read from |in|.
+// out: receives the decoded code point; may be null to only validate.
+//
+// Returns:
+//  - nullptr if |in| is null or the bytes at |in| are not well-formed
+//    UTF-8: stray continuation byte, invalid lead byte, sequence cut off
+//    by |max|, bad continuation byte, overlong encoding, UTF-16 surrogate
+//    or a value above U+10FFFF. |out| is not written in that case.
+//  - |in| itself if |max| is 0 (|out| is left untouched) or if the first
+//    byte is NUL (|out| is set to 0); the caller has to step past a NUL
+//    on its own.
+//  - otherwise a pointer to the first byte after the decoded sequence.
+char* read_utf8(char const* in, size_t max, uint32_t* out) {
+  if (!in) return nullptr;
+  if (max == 0) return const_cast<char*>(in);
+  auto d = reinterpret_cast<uint8_t const*>(in);
+  uint8_t const lead = d[0];
+  if (lead < 0x80) {
+    // ASCII. NUL is reported but not consumed.
+    if (out) *out = lead;
+    return const_cast<char*>(in) + (lead == 0 ? 0 : 1);
+  }
+  size_t size;         // Total number of bytes in the sequence.
+  uint32_t value;      // Payload bits collected so far.
+  uint32_t min_value;  // Smallest value that needs |size| bytes.
+  if ((lead & 0xe0) == 0xc0) {
+    size = 2;
+    value = lead & 0x1f;
+    min_value = 0x80;
+  } else if ((lead & 0xf0) == 0xe0) {
+    size = 3;
+    value = lead & 0xf;
+    min_value = 0x800;
+  } else if ((lead & 0xf8) == 0xf0) {
+    size = 4;
+    value = lead & 0x7;
+    min_value = 0x10000;
+  } else {
+    // 0x80-0xbf are continuation bytes and can not start a sequence,
+    // 0xf8-0xff never appear in UTF-8.
+    return nullptr;
+  }
+  if (max < size) return nullptr;
+  for (size_t i = 1; i < size; ++i) {
+    if ((d[i] & 0xc0) != 0x80) return nullptr;
+    value = value << 6 | (d[i] & 0x3f);
+  }
+  if (value < min_value) return nullptr;  // Overlong encoding.
+  if (value >= 0xd800 && value <= 0xdfff) return nullptr;  // Surrogate.
+  if (value > 0x10ffff) return nullptr;  // Outside of Unicode.
+  if (out) *out = value;
+  return const_cast<char*>(in) + size;
+}
+
+// Check that a section of |str| is valid UTF-8.
+//
+// start: byte offset of the first byte to check; offsets past the end of
+//        |str| are clamped to str.size().
+// len:   number of bytes to check; std::string::npos, or any value that
+//        reaches past the end of |str|, means "up to the end of |str|".
+//
+// Returns what valid_utf8(char const*, size_t) returns for that range.
+bool valid_utf8(std::string const& str, size_t start, size_t len) {
+  if (start > str.size()) start = str.size();
+  // Compare against the remaining size instead of computing start + len,
+  // as the sum can wrap around for huge values of |len| and then let a
+  // length reaching outside of |str| through.
+  size_t const avail = str.size() - start;
+  if (len > avail) len = avail;
+  return valid_utf8(str.data() + start, len);
+}
+
+// Check that the |len| bytes starting at |str| are valid UTF-8.
+//
+// str: start of the data; only allowed to be null when |len| is 0.
+// len: number of bytes to check, or std::string::npos to check up to the
+//      terminating NUL of |str|.
+//
+// Returns true for an empty range and when read_utf8() accepts every
+// sequence in the range. Returns false (after asserting) for a null
+// |str|, and when read_utf8() rejects a sequence or stops on a NUL byte
+// inside the range.
+bool valid_utf8(char const* str, size_t len) {
+  if (len == 0) return true;
+  if (str == nullptr) {
+    assert(false);
+    return false;
+  }
+  size_t const total = len == std::string::npos ? strlen(str) : len;
+  char const* const stop = str + total;
+  for (char const* pos = str; pos < stop;) {
+    char const* const after = read_utf8(pos, stop - pos, nullptr);
+    // read_utf8() returns its input unchanged for NUL, and NUL is not
+    // expected inside the range.
+    if (after == nullptr || after == pos) return false;
+    pos = after;
+  }
+  return true;
+}
diff --git a/src/utf.hh b/src/utf.hh
new file mode 100644
index 0000000..7625b1b
--- /dev/null
+++ b/src/utf.hh
@@ -0,0 +1,14 @@
+// -*- mode: c++; c-basic-offset: 2; -*-
+
+#ifndef UTF_HH
+#define UTF_HH
+
+// size_t and uint32_t are used in the declarations below; include their
+// headers here so this header does not depend on what was included
+// before it.
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+// Decode one UTF-8 sequence from the start of |in|, reading at most |max|
+// bytes. If |out| is not null it receives the decoded value.
+// Returns nullptr if |in| is null or the sequence is rejected, |in|
+// itself if |max| is 0 or the first byte is NUL, and otherwise a pointer
+// to the first byte after the sequence.
+char* read_utf8(char const* in, size_t max, uint32_t* out);
+
+// Check that |len| bytes of |str|, starting at byte offset |start|, are
+// valid UTF-8. The range is clamped to the size of |str|.
+bool valid_utf8(std::string const& str, size_t start = 0,
+                size_t len = std::string::npos);
+// Check that |len| bytes at |str| are valid UTF-8. With the default |len|
+// the length is taken from strlen(str). A NUL byte inside the checked
+// range makes the check fail.
+bool valid_utf8(char const* str, size_t len = std::string::npos);
+
+#endif  // UTF_HH