summaryrefslogtreecommitdiff
path: root/sax/src/utils.cc
diff options
context:
space:
mode:
author Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
committer Joel Klinghed <the_jk@spawned.biz> 2023-06-13 10:07:16 +0200
commit fc4547b412e28164af1bf8981234c6af959ccc0b (patch)
tree 061253e7a4f6abaca282223b36d10f0bed8cad23 /sax/src/utils.cc
WIP
Diffstat (limited to 'sax/src/utils.cc')
-rw-r--r-- sax/src/utils.cc 70
1 files changed, 70 insertions, 0 deletions
diff --git a/sax/src/utils.cc b/sax/src/utils.cc
new file mode 100644
index 0000000..f0366d5
--- /dev/null
+++ b/sax/src/utils.cc
@@ -0,0 +1,70 @@
+#include "utils.hh"
+
+#include "decoder.hh"
+#include "sax_decoder.hh"
+#include "sax_decoder_factory.hh"
+
+namespace modxml {
+namespace sax {
+
+namespace {
+
+// Normalizes an encoding label so it can be compared against known names.
+//
+// 'A'-'Z' are lowercased, 'a'-'z' and '0'-'9' are kept unchanged, the
+// separators '.', '_' and '-' are all written as '-', and every other
+// character (spaces, ':', ...) is dropped from the result.
+std::string cleanup_encoding(std::string const& str) {
+  std::string cleaned;
+  cleaned.reserve(str.size());
+  for (char const ch : str) {
+    bool const is_separator = ch == '.' || ch == '_' || ch == '-';
+    if (is_separator) {
+      cleaned += '-';
+      continue;
+    }
+    bool const is_upper = ch >= 'A' && ch <= 'Z';
+    bool const is_lower = ch >= 'a' && ch <= 'z';
+    bool const is_digit = ch >= '0' && ch <= '9';
+    if (is_upper) {
+      // Setting bit 0x20 turns an ASCII uppercase letter into its lowercase
+      // counterpart.
+      cleaned += static_cast<char>(ch | 0x20);
+    } else if (is_lower || is_digit) {
+      cleaned += ch;
+    }
+  }
+  return cleaned;
+}
+
+} // namespace
+
+// Names inspired by:
+// https://www.iana.org/assignments/character-sets/character-sets.xhtml
+//
+// Picks a decoder for the given encoding label.
+//
+// The label is normalized with cleanup_encoding() and compared against the
+// UTF-8, UTF-16, UTF-32 and ASCII names listed below; a match returns the
+// result of the corresponding create_*_decoder() call. If nothing matches and
+// `factory` is non-null, the original (not normalized) `encoding` is passed on
+// to factory->create(). Returns nullptr when nothing matches and no factory
+// was given.
+std::unique_ptr<Decoder> pick_decoder_for_encoding(
+    std::string const& encoding, DecoderFactory* factory) {
+  auto const name = cleanup_encoding(encoding);
+
+  // The UTF encodings are accepted both with and without the dash.
+  auto const is_either = [&name](char const* dashed, char const* plain) {
+    return name == dashed || name == plain;
+  };
+
+  if (is_either("utf-8", "utf8")) return create_utf8_decoder();
+  if (is_either("utf-16", "utf16")) return create_utf16_decoder();
+  if (is_either("utf-16be", "utf16be")) return create_utf16be_decoder();
+  if (is_either("utf-16le", "utf16le")) return create_utf16le_decoder();
+  if (is_either("utf-32", "utf32")) return create_utf32_decoder();
+  if (is_either("utf-32be", "utf32be")) return create_utf32be_decoder();
+  if (is_either("utf-32le", "utf32le")) return create_utf32le_decoder();
+
+  // Aliases for ASCII, written in the form cleanup_encoding() produces.
+  static char const* const ascii_names[] = {
+      "ascii",          "us-ascii",       "usascii",
+      "iso-ir-6",       "ansi-x3-4-1968", "ansi-x3-4-1986",
+      "iso-646-irv1991", "iso646-us",     "us",
+      "ibm367",         "cp367",
+  };
+  for (char const* ascii_name : ascii_names) {
+    if (name == ascii_name) return create_ascii_decoder();
+  }
+
+  // Unknown to the built-in list; let the caller-provided factory try, using
+  // the label exactly as it was given.
+  if (!factory) return nullptr;
+  return factory->create(encoding);
+}
+
+} // namespace sax
+
+} // namespace modxml