summaryrefslogtreecommitdiff
path: root/sax/src/sax_processor.cc
diff options
context:
space:
mode:
author Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
committer Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
commit fc4547b412e28164af1bf8981234c6af959ccc0b (patch)
tree 061253e7a4f6abaca282223b36d10f0bed8cad23 /sax/src/sax_processor.cc
WIP
Diffstat (limited to 'sax/src/sax_processor.cc')
-rw-r--r--sax/src/sax_processor.cc145
1 files changed, 145 insertions, 0 deletions
diff --git a/sax/src/sax_processor.cc b/sax/src/sax_processor.cc
new file mode 100644
index 0000000..ea9f753
--- /dev/null
+++ b/sax/src/sax_processor.cc
@@ -0,0 +1,145 @@
+#include "sax_processor.hh"
+
+#include "sax_decoder.hh"
+#include "processor.hh"
+#include "utils.hh"
+
+#include <algorithm>
+#include <optional>
+#include <utility>
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// 2.2 Characters
+// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+
+// Returns true when code point |c| matches the XML Char production [2].
+// The caller is expected to pass a Unicode scalar value (U+0 - U+10FFFF with
+// the surrogate block excluded); neither the surrogate block nor the upper
+// bound is re-checked here.
+inline bool valid_char(uint32_t c) {
+  switch (c) {
+    case 0x9:  // tab
+    case 0xa:  // line feed
+    case 0xd:  // carriage return
+      return true;
+    default:
+      break;
+  }
+  // Every other value below U+0020, plus U+FFFE and U+FFFF, is rejected.
+  const bool in_bmp_range = c >= 0x20 && c <= 0xfffd;
+  const bool is_supplementary = c >= 0x10000;
+  return in_bmp_range || is_supplementary;
+}
+
+// 2.3 Common Syntactic Constructs
+// [3] S ::= (#x20 | #x9 | #xD | #xA)+
+
+// Returns true when |c| is XML white space (production [3] S).
+// Relies on |c| having passed valid_char() first: the only values at or
+// below U+0020 that valid_char() accepts are tab, LF, CR and space, so a
+// single comparison is enough.
+inline bool is_ws(uint32_t c) {
+  constexpr uint32_t kSpace = 0x20;
+  return !(c > kSpace);
+}
+
+// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+
+// Returns true when |c| matches the NameStartChar production [4].
+// |c| is expected to have passed valid_char() already, but surrogates and
+// code points above U+EFFFF are rejected here regardless.
+inline bool is_namestartchar(uint32_t c) {
+  // ASCII: only ':', 'A'-'Z', '_' and 'a'-'z' may start a name.
+  if (c < 0x41 /* A */)
+    return c == 0x3a /* : */;
+  if (c <= 0x5a /* Z */)
+    return true;
+  if (c < 0x61 /* a */)
+    return c == 0x5f /* _ */;
+  if (c <= 0x7a /* z */)
+    return true;
+  if (c < 0xc0)
+    return false;
+  // [#xC0-#x2FF] minus the multiplication (U+00D7) and division (U+00F7)
+  // signs.
+  if (c < 0x300)
+    return c != 0xd7 && c != 0xf7;
+  // [#x300-#x36F] is allowed by NameChar [4a] only, never as a start char.
+  if (c < 0x370)
+    return false;
+  // From here on, reject the gaps between the remaining allowed ranges.
+  if (c == 0x37e)
+    return false;
+  if (c > 0x1fff && c < 0x200c)
+    return false;
+  if (c > 0x200d && c < 0x2070)
+    return false;
+  if (c > 0x218f && c < 0x2c00)
+    return false;
+  if (c > 0x2fef && c < 0x3001)
+    return false;
+  // [#x3001-#xD7FF] ends right before the surrogate block and the next
+  // allowed range starts at U+F900, so the whole gap is rejected.
+  if (c > 0xd7ff && c < 0xf900)
+    return false;
+  if (c > 0xfdcf && c < 0xfdf0)
+    return false;
+  if (c > 0xfffd && c < 0x10000)
+    return false;
+  // [#x10000-#xEFFFF] is the last allowed range; anything above is not a
+  // NameStartChar.
+  return c <= 0xeffff;
+}
+
+// Returns true when |c| matches the NameChar production [4a]: any
+// NameStartChar plus the handful of extra characters listed below.
+inline bool is_namechar(uint32_t c) {
+  if (is_namestartchar(c))
+    return true;
+  if (c == 0x2d /* - */ || c == 0x2e /* . */)
+    return true;
+  if (c >= 0x30 /* 0 */ && c <= 0x39 /* 9 */)
+    return true;
+  if (c == 0xb7)
+    return true;
+  if (c >= 0x300 && c <= 0x36f)
+    return true;
+  return c == 0x203f || c == 0x2040;
+}
+
+/* [5] Name ::= NameStartChar (NameChar)*
+[6] Names ::= Name (#x20 Name)*
+[7] Nmtoken ::= (NameChar)+
+[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+*/
+
+// Processor implementation handed out by create_processor(). As written it
+// only stores its construction arguments; no other members are defined.
+class ProcessorImpl : public Processor {
+ public:
+  // Stores every argument unchanged. |decoder| is null unless
+  // create_processor() was given an encoding to force, and |decoder_factory|
+  // is null when the processor comes from create().
+  ProcessorImpl(std::shared_ptr<Delegate> delegate,
+                std::shared_ptr<DecoderFactory> decoder_factory,
+                std::unique_ptr<Decoder> decoder,
+                std::size_t default_buffer_size,
+                std::size_t max_buffer_size)
+      : delegate_{std::move(delegate)},
+        decoder_factory_{std::move(decoder_factory)},
+        decoder_{std::move(decoder)},
+        default_buffer_size_{default_buffer_size},
+        max_buffer_size_{max_buffer_size} {}
+
+ private:
+  // Collaborators, kept in constructor-argument order.
+  std::shared_ptr<Delegate> delegate_;
+  std::shared_ptr<DecoderFactory> decoder_factory_;
+  std::unique_ptr<Decoder> decoder_;
+
+  // Buffer sizes as resolved by create_processor().
+  std::size_t default_buffer_size_;
+  std::size_t max_buffer_size_;
+};
+
+} // namespace
+
+// Builds a Processor from the given collaborators and optional settings.
+//
+// delegate, decoder_factory: handed to the processor unchanged.
+// force_encoding: when set, a decoder is picked up front through
+//   pick_decoder_for_encoding(); otherwise the processor gets a null decoder.
+// opt_default_buffer_size: 8192 when unset; values below 128 are raised to
+//   128.
+// opt_max_buffer_size: 10 MiB when unset; a caller value is used unchecked.
+std::unique_ptr<Processor> create_processor(
+    std::shared_ptr<Delegate> delegate,
+    std::shared_ptr<DecoderFactory> decoder_factory,
+    std::optional<std::string> force_encoding,
+    std::optional<std::size_t> opt_default_buffer_size,
+    std::optional<std::size_t> opt_max_buffer_size) {
+  constexpr std::size_t kMinDefaultBufferSize = 128;
+  constexpr std::size_t kDefaultBufferSize = 8192;
+  // This value is documented in public headers. Do NOT change.
+  constexpr std::size_t kDefaultMaxBufferSize = 10 * 1024 * 1024;
+
+  std::unique_ptr<Decoder> decoder;
+  if (force_encoding)
+    decoder = pick_decoder_for_encoding(*force_encoding, decoder_factory.get());
+
+  const std::size_t default_buffer_size =
+      opt_default_buffer_size
+          ? std::max(kMinDefaultBufferSize, *opt_default_buffer_size)
+          : kDefaultBufferSize;
+
+  // A user supplied maximum is deliberately not validated. If it is too
+  // small a MAX_MEMORY_EXCEEDED error will be thrown; if it is too large we
+  // will get OUT_OF_MEMORY or crash depending on platform.
+  const std::size_t max_buffer_size =
+      opt_max_buffer_size ? *opt_max_buffer_size : kDefaultMaxBufferSize;
+
+  return std::make_unique<ProcessorImpl>(std::move(delegate),
+                                         std::move(decoder_factory),
+                                         std::move(decoder),
+                                         default_buffer_size,
+                                         max_buffer_size);
+}
+
+// Convenience wrapper around create_processor(): no decoder factory, no
+// forced encoding and both buffer sizes left unset.
+std::unique_ptr<Processor> create(std::shared_ptr<Delegate> delegate) {
+  std::shared_ptr<DecoderFactory> no_decoder_factory;
+  return create_processor(std::move(delegate), std::move(no_decoder_factory),
+                          std::nullopt, std::nullopt, std::nullopt);
+}
+
+} // namespace sax
+} // namespace modxml