summaryrefslogtreecommitdiff
path: root/src/utf.cc
blob: 208dd33289defee2e70ac7411d4d69f813cacc3f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
// -*- mode: c++; c-basic-offset: 2; -*-

#include "common.hh"

#include <string.h>

#include "utf.hh"

// Decodes a single UTF-8 encoded code point from the start of |in|.
//
// in:  buffer to decode from; may be null.
// max: number of bytes that may be read starting at |in|.
// out: if non-null, receives the decoded code point on success.
//
// Returns a pointer to the first byte after the decoded sequence.
// Returns |in| itself (nothing consumed) when |max| is 0, or when the first
// byte is NUL, in which case |*out| is set to 0.
// Returns nullptr when |in| is null, when the sequence needs more than |max|
// bytes, or when the bytes are not well-formed UTF-8: a continuation byte
// without a lead byte, an invalid lead byte (0xf8..0xff), a missing
// continuation byte, an overlong encoding, a surrogate (U+D800..U+DFFF) or a
// value above U+10FFFF. |*out| is left untouched in that case.
char* read_utf8(char const* in, size_t max, uint32_t* out) {
  if (!in) return nullptr;
  if (max == 0) return const_cast<char*>(in);
  auto d = reinterpret_cast<uint8_t*>(const_cast<char*>(in));
  // The high nibble of the lead byte determines the sequence length.
  switch (*d >> 4) {
  case 0:
    if (*d == 0) {
      // NUL is reported but not consumed, callers detect it by getting
      // their input pointer back.
      if (out) *out = 0;
      break;
    }
    // Fallthrough
  case 1:
  case 2:
  case 3:
  case 4:
  case 5:
  case 6:
  case 7:
    // 0x01..0x7f: plain ASCII, one byte.
    if (out) *out = *d;
    ++d;
    break;
  case 12:
  case 13: {
    // 110xxxxx 10xxxxxx
    if (max < 2) return nullptr;
    if ((d[1] & 0xc0) != 0x80) return nullptr;
    uint32_t tmp = (d[0] & 0x1f) << 6 | (d[1] & 0x3f);
    if (tmp < 0x80) return nullptr;  // Overlong
    if (out) *out = tmp;
    d += 2;
    break;
  }
  case 14: {
    // 1110xxxx 10xxxxxx 10xxxxxx
    if (max < 3) return nullptr;
    if ((d[1] & 0xc0) != 0x80) return nullptr;
    if ((d[2] & 0xc0) != 0x80) return nullptr;
    uint32_t tmp = (d[0] & 0xf) << 12 | (d[1] & 0x3f) << 6 | (d[2] & 0x3f);
    if (tmp < 0x800) return nullptr;  // Overlong
    // UTF-16 surrogates are not valid code points in UTF-8
    if (tmp >= 0xd800 && tmp <= 0xdfff) return nullptr;
    if (out) *out = tmp;
    d += 3;
    break;
  }
  case 15: {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 0xf8..0xff are never valid lead bytes
    if (*d & 0x08) return nullptr;
    if (max < 4) return nullptr;
    if ((d[1] & 0xc0) != 0x80) return nullptr;
    if ((d[2] & 0xc0) != 0x80) return nullptr;
    if ((d[3] & 0xc0) != 0x80) return nullptr;
    uint32_t tmp = (d[0] & 0x7) << 18 | (d[1] & 0x3f) << 12
      | (d[2] & 0x3f) << 6 | (d[3] & 0x3f);
    if (tmp < 0x10000) return nullptr;  // Overlong
    if (tmp > 0x10ffff) return nullptr;  // Beyond the Unicode range
    if (out) *out = tmp;
    d += 4;
    break;
  }
  default:
    // 0x80..0xbf: continuation byte without a preceding lead byte
    return nullptr;
  }
  return reinterpret_cast<char*>(d);
}

// Checks that a range of |str| is valid UTF-8.
//
// str:   string to inspect.
// start: offset of the first byte to check; clamped to str.size().
// len:   number of bytes to check; clamped to what remains after |start|,
//        so std::string::npos means "until the end of the string".
//
// Returns the result of valid_utf8(char const*, size_t) for the clamped
// range.
bool valid_utf8(std::string const& str, size_t start, size_t len) {
  start = std::min(start, str.size());
  // Compare against the remaining size instead of computing start + len,
  // the sum may wrap around for large values of len.
  len = std::min(len, str.size() - start);
  return valid_utf8(str.data() + start, len);
}

// Checks that the |len| bytes starting at |str| are valid UTF-8.
//
// str: buffer to inspect; may only be null when |len| is 0.
// len: number of bytes to check, or std::string::npos to check up to
//      (not including) the terminating NUL found by strlen().
//
// Returns true if the whole range could be decoded with read_utf8() and
// every decoded sequence consumed at least one byte, false otherwise.
bool valid_utf8(char const* str, size_t len) {
  // An empty range is valid no matter what str points to.
  if (len == 0) return true;
  if (str == nullptr) {
    assert(false);
    return false;
  }
  size_t const total = len == std::string::npos ? strlen(str) : len;
  char const* const limit = str + total;
  for (char const* cursor = str; cursor < limit; ) {
    char const* const after = read_utf8(cursor, limit - cursor, nullptr);
    // nullptr signals a decode failure; getting the same pointer back means
    // nothing was consumed (\0 are unexpected here).
    if (after == nullptr || after == cursor) return false;
    cursor = after;
  }
  return true;
}