diff options
Diffstat (limited to 'src/utf.cc')
| -rw-r--r-- | src/utf.cc | 87 |
1 files changed, 87 insertions, 0 deletions
diff --git a/src/utf.cc b/src/utf.cc new file mode 100644 index 0000000..208dd33 --- /dev/null +++ b/src/utf.cc @@ -0,0 +1,87 @@ +// -*- mode: c++; c-basic-offset: 2; -*- + +#include "common.hh" + +#include <string.h> + +#include "utf.hh" + +char* read_utf8(char const* in, size_t max, uint32_t* out) { + if (!in) return nullptr; + if (max == 0) return const_cast<char*>(in); + auto d = reinterpret_cast<uint8_t*>(const_cast<char*>(in)); + switch (*d >> 4) { + case 0: + if (*d == 0) { + if (out) *out = 0; + break; + } + // Fallthrough + default: + if (out) *out = *d; + ++d; + break; + case 12: + case 13: { + if (max < 2) return nullptr; + if ((d[1] & 0xc0) != 0x80) return nullptr; + uint32_t tmp = (d[0] & 0x1f) << 6 | (d[1] & 0x3f); + if (tmp < 0x80) return nullptr; + if (out) *out = tmp; + d += 2; + break; + } + case 14: { + if (max < 3) return nullptr; + if ((d[1] & 0xc0) != 0x80) return nullptr; + if ((d[2] & 0xc0) != 0x80) return nullptr; + uint32_t tmp = (d[0] & 0xf) << 12 | (d[1] & 0x3f) << 6 | (d[2] & 0x3f); + if (tmp < 0x800) return nullptr; + if (out) *out = tmp; + d += 3; + break; + } + case 15: { + if (max < 4) return nullptr; + if ((d[1] & 0xc0) != 0x80) return nullptr; + if ((d[2] & 0xc0) != 0x80) return nullptr; + if ((d[3] & 0xc0) != 0x80) return nullptr; + uint32_t tmp = (d[0] & 0x7) << 18 | (d[1] & 0x3f) << 12 + | (d[2] & 0x3f) << 6 | (d[3] & 0x3f); + if (tmp < 0x10000) return nullptr; + if (out) *out = tmp; + d += 4; + break; + } + } + return reinterpret_cast<char*>(d); +} + +bool valid_utf8(std::string const& str, size_t start, size_t len) { + if (start == 0) return valid_utf8(str.data(), std::min(str.size(), len)); + if (start > str.size()) start = str.size(); + if (len == std::string::npos || start + len > str.size()) { + len = str.size() - start; + } + return valid_utf8(str.data() + start, len); +} + +bool valid_utf8(char const* str, size_t len) { + if (len == 0) return true; + if (!str) { + assert(false); + return false; + } + if (len == std::string::npos) { + len = strlen(str); + if (len == 0) return true; + } + auto end = str + len; + while (str < end) { + auto next = read_utf8(str, end - str, nullptr); + if (!next) return false; + if (next == str) return false; // \0 are unexpected here + str = next; + } + return true; +} |
