// -*- mode: c++; c-basic-offset: 2; -*-
//
// UTF-8 helpers from the "Add utf module" patch (src/utf.hh + src/utf.cc),
// restored from a whitespace-collapsed diff. The same patch adds
// "utf.hh utf.cc" to libtp_a_SOURCES in src/Makefile.am and "/test-utf" to
// test/.gitignore.
//
// NOTE(review): in the patch, utf.cc includes the project's "common.hh" first
// and then "utf.hh"; the angle-bracket include targets were stripped from the
// diff, so the standard headers below are chosen from what the code uses --
// confirm (and re-add "common.hh") when splitting this back into two files.

#ifndef UTF_HH
#define UTF_HH

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>

// Decodes the single UTF-8 sequence starting at |in|, looking at no more than
// |max| bytes.
//
// Returns:
//  - nullptr if |in| is null, or the sequence is truncated or not well-formed
//    UTF-8 (stray continuation byte, invalid lead byte, overlong form,
//    surrogate, or value above U+10FFFF),
//  - |in| itself if |max| is 0 (|out| is left untouched) or if the first byte
//    is NUL (|*out| is set to 0),
//  - otherwise a pointer just past the decoded sequence, with |*out| set to
//    the code point.
// |out| may be null when only the length/validity is of interest.
char* read_utf8(char const* in, size_t max, uint32_t* out);

// Returns true if the |len| bytes of |str| starting at |start| are valid
// UTF-8 without embedded NUL bytes. |start| and |len| are clamped to the
// string, so the default arguments check the whole string.
bool valid_utf8(std::string const& str, size_t start = 0,
                size_t len = std::string::npos);

// Returns true if the |len| bytes at |str| are valid UTF-8 without embedded
// NUL bytes. With |len| == std::string::npos, |str| is treated as
// NUL-terminated and its strlen() is used.
bool valid_utf8(char const* str, size_t len = std::string::npos);

#endif  // UTF_HH

char* read_utf8(char const* in, size_t max, uint32_t* out) {
  if (!in) return nullptr;
  auto const start = const_cast<char*>(in);
  if (max == 0) return start;

  auto const bytes = reinterpret_cast<uint8_t const*>(in);
  uint8_t const lead = bytes[0];

  // ASCII. NUL is reported (value 0) but never consumed, so callers can tell
  // "end of string" apart from a real character by the unchanged pointer.
  if (lead < 0x80) {
    if (out) *out = lead;
    return lead == 0 ? start : start + 1;
  }

  size_t size;        // total length of the sequence in bytes
  uint32_t value;     // payload bits collected so far
  uint32_t smallest;  // lowest code point that legitimately needs |size| bytes
  if (lead < 0xc0) {
    // 10xxxxxx: a continuation byte can never start a sequence.
    return nullptr;
  } else if (lead < 0xe0) {
    size = 2;
    value = lead & 0x1f;
    smallest = 0x80;
  } else if (lead < 0xf0) {
    size = 3;
    value = lead & 0x0f;
    smallest = 0x800;
  } else if (lead < 0xf8) {
    size = 4;
    value = lead & 0x07;
    smallest = 0x10000;
  } else {
    // 11111xxx: 5/6-byte forms and 0xfe/0xff are not UTF-8.
    return nullptr;
  }

  if (max < size) return nullptr;
  for (size_t i = 1; i < size; ++i) {
    if ((bytes[i] & 0xc0) != 0x80) return nullptr;
    value = (value << 6) | (bytes[i] & 0x3f);
  }

  if (value < smallest) return nullptr;                         // overlong
  if (value >= 0xd800 && value <= 0xdfff) return nullptr;       // surrogate
  if (value > 0x10ffff) return nullptr;                         // out of range

  if (out) *out = value;
  return start + size;
}

bool valid_utf8(std::string const& str, size_t start, size_t len) {
  // Clamp by subtraction instead of testing start + len, which can wrap
  // around for a huge (but not npos) |len| and let the range escape the
  // string.
  size_t const begin = std::min(start, str.size());
  size_t const count = std::min(len, str.size() - begin);
  return valid_utf8(str.data() + begin, count);
}

bool valid_utf8(char const* str, size_t len) {
  if (len == 0) return true;
  if (!str) {
    // A null pointer with a non-zero length is a caller bug.
    assert(false);
    return false;
  }
  if (len == std::string::npos) len = std::strlen(str);

  char const* const end = str + len;
  while (str < end) {
    char const* const next = read_utf8(str, end - str, nullptr);
    // nullptr: malformed sequence. No progress: read_utf8 stopped on a NUL
    // byte, which is not expected inside the checked range.
    if (!next || next == str) return false;
    str = next;
  }
  return true;
}
--git a/test/Makefile.am b/test/Makefile.am index abc1f9c..5248547 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -3,7 +3,7 @@ MAINTAINERCLEANFILES = Makefile.in AM_CXXFLAGS = @DEFINES@ -I$(top_srcdir)/src TESTS = test-url test-http test-args test-xdg test-paths test-strings \ - test-observers test-htmlattrtext test-package + test-observers test-htmlattrtext test-package test-utf check_PROGRAMS = $(TESTS) @@ -32,3 +32,6 @@ test_htmlattrtext_LDADD = $(top_builddir)/src/libattrstr.a test_package_SOURCES = test-package.cc test.hh test_package_LDADD = $(top_builddir)/src/libtp.a + +test_utf_SOURCES = test-utf.cc test.hh +test_utf_LDADD = $(top_builddir)/src/libtp.a diff --git a/test/test-utf.cc b/test/test-utf.cc new file mode 100644 index 0000000..7096a47 --- /dev/null +++ b/test/test-utf.cc @@ -0,0 +1,64 @@ +// -*- mode: c++; c-basic-offset: 2; -*- + +#include "common.hh" +#include "test.hh" + +#include + +#include "utf.hh" + +namespace { + +bool test_empty() { + ASSERT_EQ(true, valid_utf8("")); + ASSERT_EQ(true, valid_utf8("", 0)); + ASSERT_EQ(true, valid_utf8("\xff", 0)); + auto str = ""; + ASSERT_EQ(str, read_utf8(str, 0, nullptr)); + ASSERT_EQ(str, read_utf8(str, 10, nullptr)); + return true; +} + +bool test_good(char const* str, ...) 
{ + ASSERT_EQ(true, valid_utf8(str)); + va_list args; + va_start(args, str); + auto pos = str; + auto end = str + strlen(str) + 1; + uint32_t value; + while (true) { + uint32_t expected = va_arg(args, uint32_t); + auto next = read_utf8(pos, end - pos, &value); + if (expected == 0) { + ASSERT_EQ(pos, next); + ASSERT_EQ(expected, value); + break; + } else { + ASSERT_EQ(false, pos == next); + ASSERT_EQ(false, next == nullptr); + ASSERT_EQ(expected, value); + } + pos = next; + } + va_end(args); + return true; +} + +bool test_bad(char const* str) { + ASSERT_EQ(false, valid_utf8(str)); + return true; +} + +} // namespace + +int main(void) { + BEFORE; + RUN(test_empty()); + RUN(test_good("$", 0x24, 0)); + RUN(test_good("\xc2\xa2", 0xa2, 0)); + RUN(test_good("\xe2\x82\xac", 0x20ac, 0)); + RUN(test_good("\xf0\x90\x8d\x88", 0x10348, 0)); + RUN(test_bad("\xf0\x82\x82\xac")); + AFTER; +} + -- cgit v1.2.3-70-g09d2