summaryrefslogtreecommitdiff
path: root/sax/src/sax_processor.cc
blob: ea9f75307f2a96868259d2a924b233c64db30fd8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#include "sax_processor.hh"

#include "sax_decoder.hh"
#include "processor.hh"
#include "utils.hh"

#include <algorithm>
#include <optional>
#include <utility>

namespace modxml {
namespace sax {

namespace {

// 2.2 Characters
// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]

inline bool valid_char(uint32_t c) {
  // Precondition: c is already a valid Unicode code point (U+0 - U+10ffff,
  // surrogate blocks excluded); only the XML specific exclusions are
  // checked here.
  switch (c) {
    case 0x9:
    case 0xa:
    case 0xd:
      // The only control characters XML allows.
      return true;
    default:
      break;
  }
  // Remaining C0 controls are never allowed.
  if (c < 0x20)
    return false;
  // Everything else is fine except the two noncharacters U+FFFE and U+FFFF.
  return c <= 0xfffd || c >= 0x10000;
}

// 2.3 Common Syntactic Constructs
// [3] S ::= (#x20 | #x9 | #xD | #xA)+

inline bool is_ws(uint32_t c) {
  // Callers are expected to have run valid_char first, which leaves
  // #x9, #xA, #xD and #x20 as the only possible values at or below space,
  // so a single upper bound comparison is enough.
  constexpr uint32_t kSpace = 0x20;
  return !(c > kSpace);
}

// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
// [4a]	NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]

// Returns true if |c| matches the NameStartChar production ([4]) of XML 1.0.
//
// The ASCII part is resolved first, then every gap between the allowed
// ranges is rejected in ascending order. Anything that survives all the gap
// checks is accepted as long as it does not exceed the last allowed
// code point, #xEFFFF.
inline bool is_namestartchar(uint32_t c) {
  // ":" | [A-Z] | "_" | [a-z]
  if (c < 0x41 /* A */)
    return c == 0x3a /* : */;
  if (c <= 0x5a /* Z */)
    return true;
  if (c < 0x61 /* a */)
    return c == 0x5f /* _ */;
  if (c <= 0x7a /* z */)
    return true;
  if (c < 0xc0)
    return false;
  // [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
  if (c < 0x300)
    return c != 0xd7 && c != 0xf7;
  // [#x300-#x36F] (combining marks) are NameChar only, never a start
  // character. #x37E sits between [#x370-#x37D] and [#x37F-#x1FFF].
  if (c < 0x370 || c == 0x37e)
    return false;
  if (c > 0x1fff && c < 0x200c)
    return false;
  if (c > 0x200d && c < 0x2070)
    return false;
  if (c > 0x218f && c < 0x2c00)
    return false;
  if (c > 0x2fef && c < 0x3001)
    return false;
  // Gap between [#x3001-#xD7FF] and [#xF900-#xFDCF]. This also rejects the
  // surrogate blocks, so the result is correct even if the caller did not
  // filter them out beforehand.
  if (c > 0xd7ff && c < 0xf900)
    return false;
  if (c > 0xfdcf && c < 0xfdf0)
    return false;
  if (c > 0xfffd && c < 0x10000)
    return false;
  // [#x10000-#xEFFFF]: planes 15 and 16 (and anything above) are excluded.
  return c <= 0xeffff;
}

inline bool is_namechar(uint32_t c) {
  // Every NameStartChar is also a NameChar.
  if (is_namestartchar(c))
    return true;
  // "-" | "."
  if (c == 0x2d /* - */ || c == 0x2e /* . */)
    return true;
  // [0-9]
  if (c >= 0x30 /* 0 */ && c <= 0x39 /* 9 */)
    return true;
  // #xB7
  if (c == 0xb7)
    return true;
  // [#x0300-#x036F]
  if (c >= 0x300 && c <= 0x36f)
    return true;
  // [#x203F-#x2040]
  return c == 0x203f || c == 0x2040;
}

// [5] Name ::= NameStartChar (NameChar)*
// [6] Names ::= Name (#x20 Name)*
// [7] Nmtoken ::= (NameChar)+
// [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*

// File-local Processor implementation. As visible here it only takes
// ownership of its collaborators and records the two buffer size settings
// handed to it by the factory function.
class ProcessorImpl : public Processor {
 public:
  // All arguments are stored as given; no validation is performed here.
  // |decoder| may be null when no encoding was forced by the caller.
  ProcessorImpl(std::shared_ptr<Delegate> delegate,
                std::shared_ptr<DecoderFactory> decoder_factory,
                std::unique_ptr<Decoder> decoder,
                std::size_t default_buffer_size,
                std::size_t max_buffer_size)
      : delegate_{std::move(delegate)},
        decoder_factory_{std::move(decoder_factory)},
        decoder_{std::move(decoder)},
        default_buffer_size_{default_buffer_size},
        max_buffer_size_{max_buffer_size} {
  }

 private:
  // Collaborators.
  std::shared_ptr<Delegate> delegate_;
  std::shared_ptr<DecoderFactory> decoder_factory_;
  std::unique_ptr<Decoder> decoder_;

  // Buffer sizing, in the units chosen by the factory function.
  std::size_t default_buffer_size_;
  std::size_t max_buffer_size_;
};

}  // namespace

std::unique_ptr<Processor> create_processor(
    std::shared_ptr<Delegate> delegate,
    std::shared_ptr<DecoderFactory> decoder_factory,
    std::optional<std::string> force_encoding,
    std::optional<std::size_t> opt_default_buffer_size,
    std::optional<std::size_t> opt_max_buffer_size) {

  std::unique_ptr<Decoder> decoder;
  if (force_encoding.has_value()) {
    decoder = pick_decoder_for_encoding(force_encoding.value(),
                                        decoder_factory.get());
  }

  std::size_t default_buffer_size = 8192;
  if (opt_default_buffer_size.has_value())
    default_buffer_size = std::max(static_cast<std::size_t>(128),
                                   opt_default_buffer_size.value());
  // This value is documented in public headers. Do NOT change.
  std::size_t max_buffer_size = 10 * 1024 * 1024;
  // No validation for user set value. If it is too small MAX_MEMORY_EXCEEDED
  // error will be thrown. If it is too large we will get OUT_OF_MEMORY or
  // crash depending on platform.
  if (opt_max_buffer_size.has_value())
    max_buffer_size = opt_max_buffer_size.value();

  return std::make_unique<ProcessorImpl>(std::move(delegate),
                                         std::move(decoder_factory),
                                         std::move(decoder),
                                         default_buffer_size,
                                         max_buffer_size);
}

// Convenience factory: a processor for |delegate| with no decoder factory,
// no forced encoding and the default buffer sizes.
std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) {
  return create_processor(std::move(delegate),
                          /* decoder_factory */ nullptr,
                          /* force_encoding */ std::nullopt,
                          /* opt_default_buffer_size */ std::nullopt,
                          /* opt_max_buffer_size */ std::nullopt);
}

}  // namespace sax
}  // namespace modxml