// -*- mode: c++; c-basic-offset: 2; -*- #include "common.hh" #include #include "utf.hh" char* read_utf8(char const* in, size_t max, uint32_t* out) { if (!in) return nullptr; if (max == 0) return const_cast(in); auto d = reinterpret_cast(const_cast(in)); switch (*d >> 4) { case 0: if (*d == 0) { if (out) *out = 0; break; } // Fallthrough default: if (out) *out = *d; ++d; break; case 12: case 13: { if (max < 2) return nullptr; if ((d[1] & 0xc0) != 0x80) return nullptr; uint32_t tmp = (d[0] & 0x1f) << 6 | (d[1] & 0x3f); if (tmp < 0x80) return nullptr; if (out) *out = tmp; d += 2; break; } case 14: { if (max < 3) return nullptr; if ((d[1] & 0xc0) != 0x80) return nullptr; if ((d[2] & 0xc0) != 0x80) return nullptr; uint32_t tmp = (d[0] & 0xf) << 12 | (d[1] & 0x3f) << 6 | (d[2] & 0x3f); if (tmp < 0x800) return nullptr; if (out) *out = tmp; d += 3; break; } case 15: { if (max < 4) return nullptr; if ((d[1] & 0xc0) != 0x80) return nullptr; if ((d[2] & 0xc0) != 0x80) return nullptr; if ((d[3] & 0xc0) != 0x80) return nullptr; uint32_t tmp = (d[0] & 0x7) << 18 | (d[1] & 0x3f) << 12 | (d[2] & 0x3f) << 6 | (d[3] & 0x3f); if (tmp < 0x10000) return nullptr; if (out) *out = tmp; d += 4; break; } } return reinterpret_cast(d); } bool valid_utf8(std::string const& str, size_t start, size_t len) { if (start == 0) return valid_utf8(str.data(), std::min(str.size(), len)); if (start > str.size()) start = str.size(); if (len == std::string::npos || start + len > str.size()) { len = str.size() - start; } return valid_utf8(str.data() + start, len); } bool valid_utf8(char const* str, size_t len) { if (len == 0) return true; if (!str) { assert(false); return false; } if (len == std::string::npos) { len = strlen(str); if (len == 0) return true; } auto end = str + len; while (str < end) { auto next = read_utf8(str, end - str, nullptr); if (!next) return false; if (next == str) return false; // \0 are unexpected here str = next; } return true; }