blob: 0e444ae2d2599ee4a76534f7d85f5bbd578171d3 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
#include "utf8.hh"
#include "utf_error.hh"
namespace utf {
namespace {
// Tells whether `c` may be represented in UTF-8: it must not exceed
// 0x10ffff and must not lie in the surrogate range 0xd800..0xdfff.
inline bool valid_codepoint(uint32_t c) {
  if (c > 0x10ffff) {
    return false;
  }
  bool const is_surrogate = c >= 0xd800 && c <= 0xdfff;
  return !is_surrogate;
}
} // namespace
// Decodes a single UTF-8 sequence from `data`, beginning at `offset`.
//
// On success the decoded code point is returned and `offset` is moved past
// the bytes that were consumed. On failure `offset` is left untouched and
// one of two sentinels is returned:
//   NEED_MORE - `offset` is at or past the end of `data`, or fewer bytes
//               remain than the lead byte announces.
//   INVALID   - the lead byte cannot start a sequence, a continuation byte
//               is not of the form 10xxxxxx, the value is encoded with more
//               bytes than necessary (overlong), or the value fails
//               valid_codepoint().
uint32_t read8(std::span<uint8_t const> data, std::size_t& offset) {
  if (offset >= data.size())
    return NEED_MORE;

  uint8_t const lead = data[offset];

  // 0xxxxxxx: a one byte sequence, the byte is the code point.
  if ((lead & 0x80) == 0) {
    ++offset;
    return lead;
  }

  std::size_t length;
  uint32_t smallest;  // Lowest value that legitimately needs `length` bytes.
  uint32_t codepoint;
  if ((lead & 0xe0) == 0xc0) {  // 110xxxxx
    length = 2;
    smallest = 0x80;
    codepoint = lead & 0x1f;
  } else if ((lead & 0xf0) == 0xe0) {  // 1110xxxx
    length = 3;
    smallest = 0x800;
    codepoint = lead & 0x0f;
  } else if ((lead & 0xf8) == 0xf0) {  // 11110xxx
    length = 4;
    smallest = 0x10000;
    codepoint = lead & 0x07;
  } else {
    // Either a continuation byte in lead position (10xxxxxx) or 11111xxx.
    return INVALID;
  }

  // The truncation check comes before any continuation byte is inspected,
  // so a cut-off sequence always reports NEED_MORE.
  if (data.size() - offset < length)
    return NEED_MORE;

  // Each continuation byte contributes its low six bits.
  for (std::size_t index = 1; index < length; ++index) {
    uint8_t const next = data[offset + index];
    if ((next & 0xc0) != 0x80)
      return INVALID;
    codepoint = (codepoint << 6) | static_cast<uint32_t>(next & 0x3f);
  }

  // Reject overlong encodings as well as surrogates and out of range values.
  if (codepoint < smallest || !valid_codepoint(codepoint))
    return INVALID;

  offset += length;
  return codepoint;
}
// Encodes `codepoint` as UTF-8 into `data`, starting at `offset`.
//
// Returns true after writing one to four bytes and advancing `offset` past
// them. Returns false, writing nothing and leaving `offset` as it was, when
// `offset` is at or past the end of `data` or the remaining space is smaller
// than the encoded length.
//
// NOTE(review): `codepoint` is not validated here. Surrogates are emitted in
// the three byte form and anything from 0x10000 upwards is emitted in the
// four byte form, with the lead byte truncated to eight bits.
bool write8(uint32_t codepoint, std::span<uint8_t> data, std::size_t& offset) {
  if (offset >= data.size()) UNLIKELY {
    return false;
  }

  // Pick the encoded length together with the marker bits of the lead byte.
  std::size_t length = 4;
  uint32_t lead_marker = 0xf0;
  if (codepoint < 0x80) {
    length = 1;
    lead_marker = 0x00;
  } else if (codepoint < 0x800) {
    length = 2;
    lead_marker = 0xc0;
  } else if (codepoint < 0x10000) {
    length = 3;
    lead_marker = 0xe0;
  }

  // `offset` is known to be inside `data`, so the subtraction cannot wrap.
  if (data.size() - offset < length) UNLIKELY {
    return false;
  }

  // The lead byte carries the topmost bits; every following byte carries
  // the next lower group of six bits behind a 10xxxxxx marker.
  std::size_t shift = 6 * (length - 1);
  data[offset++] = static_cast<uint8_t>(lead_marker | (codepoint >> shift));
  while (shift != 0) {
    shift -= 6;
    data[offset++] = static_cast<uint8_t>(0x80 | ((codepoint >> shift) & 0x3f));
  }
  return true;
}
} // namespace utf
|