blob: 208dd33289defee2e70ac7411d4d69f813cacc3f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
// -*- mode: c++; c-basic-offset: 2; -*-
#include "common.hh"
#include <string.h>
#include "utf.hh"
// Decodes one UTF-8 encoded code point from the start of |in|.
//
// in:  first byte of the sequence to decode; may be null.
// max: number of bytes that may be read starting at |in|.
// out: if non-null, receives the decoded code point on success.
//
// Returns
//  - nullptr if |in| is null or the bytes at |in| are not a well-formed
//    UTF-8 sequence (stray continuation byte, invalid lead byte, truncated
//    or overlong sequence, UTF-16 surrogate, or a value above U+10FFFF);
//  - |in| itself if |max| is 0 (|out| untouched) or if the first byte is
//    NUL (|out| set to 0), i.e. nothing was consumed;
//  - otherwise a pointer to the byte following the decoded sequence.
char* read_utf8(char const* in, size_t max, uint32_t* out) {
  if (!in) return nullptr;
  if (max == 0) return const_cast<char*>(in);
  auto d = reinterpret_cast<uint8_t*>(const_cast<char*>(in));
  // The high nibble of the lead byte selects the sequence length.
  switch (*d >> 4) {
    case 0:
      if (*d == 0) {
        // NUL terminator: report code point 0 but do not advance.
        if (out) *out = 0;
        break;
      }
      // Fallthrough
    case 1:
    case 2:
    case 3:
    case 4:
    case 5:
    case 6:
    case 7:
      // Plain ASCII, one byte.
      if (out) *out = *d;
      ++d;
      break;
    case 8:
    case 9:
    case 10:
    case 11:
      // 0x80..0xBF are continuation bytes and can never start a sequence.
      return nullptr;
    case 12:
    case 13: {
      if (max < 2) return nullptr;
      if ((d[1] & 0xc0) != 0x80) return nullptr;
      uint32_t tmp = (d[0] & 0x1f) << 6 | (d[1] & 0x3f);
      // Reject overlong encodings (this also covers lead bytes 0xC0/0xC1).
      if (tmp < 0x80) return nullptr;
      if (out) *out = tmp;
      d += 2;
      break;
    }
    case 14: {
      if (max < 3) return nullptr;
      if ((d[1] & 0xc0) != 0x80) return nullptr;
      if ((d[2] & 0xc0) != 0x80) return nullptr;
      uint32_t tmp = (d[0] & 0xf) << 12 | (d[1] & 0x3f) << 6 | (d[2] & 0x3f);
      // Reject overlong encodings.
      if (tmp < 0x800) return nullptr;
      // UTF-16 surrogates are not valid scalar values in UTF-8.
      if (tmp >= 0xd800 && tmp <= 0xdfff) return nullptr;
      if (out) *out = tmp;
      d += 3;
      break;
    }
    case 15: {
      // Only 11110xxx is a valid 4-byte lead; 0xF8..0xFF are never valid.
      if ((d[0] & 0x08) != 0) return nullptr;
      if (max < 4) return nullptr;
      if ((d[1] & 0xc0) != 0x80) return nullptr;
      if ((d[2] & 0xc0) != 0x80) return nullptr;
      if ((d[3] & 0xc0) != 0x80) return nullptr;
      uint32_t tmp = (d[0] & 0x7) << 18 | (d[1] & 0x3f) << 12
                     | (d[2] & 0x3f) << 6 | (d[3] & 0x3f);
      // Reject overlong encodings and values beyond the Unicode range.
      if (tmp < 0x10000 || tmp > 0x10ffff) return nullptr;
      if (out) *out = tmp;
      d += 4;
      break;
    }
  }
  return reinterpret_cast<char*>(d);
}
// Checks whether the byte range [start, start + len) of |str| is valid UTF-8.
//
// |start| past the end of the string is clamped to the end (an empty range).
// |len| is clamped to the bytes remaining after |start|, so passing
// std::string::npos (or any oversized value) means "to the end of |str|".
// The actual validation is delegated to the pointer/length overload.
bool valid_utf8(std::string const& str, size_t start, size_t len) {
  if (start > str.size()) start = str.size();
  size_t const remaining = str.size() - start;
  // Compare against the remaining size rather than computing start + len,
  // which could wrap around for a huge |len| and skip the clamp.
  if (len > remaining) len = remaining;
  return valid_utf8(str.data() + start, len);
}
// Checks whether the |len| bytes starting at |str| are valid UTF-8.
//
// A |len| of 0 is always valid, even for a null |str|. A null |str| with a
// non-zero |len| asserts and returns false. If |len| is std::string::npos,
// |str| is treated as NUL-terminated and its length is taken from strlen().
// Embedded NUL bytes inside the range make the input invalid.
bool valid_utf8(char const* str, size_t len) {
  if (len == 0) return true;
  if (str == nullptr) {
    assert(false);
    return false;
  }
  size_t const count = (len == std::string::npos) ? strlen(str) : len;
  char const* const stop = str + count;
  for (char const* cursor = str; cursor < stop;) {
    char const* const after = read_utf8(cursor, stop - cursor, nullptr);
    // nullptr: malformed sequence. No progress: read_utf8 hit a NUL byte,
    // which is not expected inside the range.
    if (after == nullptr || after == cursor) return false;
    cursor = after;
  }
  return true;
}
|