#include "utils.hh"
#include "decoder.hh"
#include "sax_decoder.hh"
#include "sax_decoder_factory.hh"

namespace modxml {
namespace sax {

namespace {

// Normalises an encoding label so that spelling variants compare equal:
//   - 'A'..'Z' are folded to lower case,
//   - 'a'..'z' and '0'..'9' are kept unchanged,
//   - '.', '_' and '-' all become '-',
//   - every other character is dropped.
std::string cleanup_encoding(std::string_view str) {
  std::string normalized;
  normalized.reserve(str.size());

  for (const char ch : str) {
    const bool is_upper = ch >= 'A' && ch <= 'Z';
    const bool is_lower = ch >= 'a' && ch <= 'z';
    const bool is_digit = ch >= '0' && ch <= '9';

    if (is_upper) {
      // Setting bit 0x20 turns an ASCII upper-case letter into lower case.
      normalized.push_back(static_cast<char>(ch | 0x20));
      continue;
    }
    if (is_lower || is_digit) {
      normalized.push_back(ch);
      continue;
    }

    switch (ch) {
      case '.':
      case '_':
      case '-':
        // All separator spellings collapse to a single canonical '-'.
        normalized.push_back('-');
        break;
      default:
        // Anything else (spaces, ':', ...) is silently discarded.
        break;
    }
  }

  return normalized;
}

}  // namespace

// Names inspired by:
// https://www.iana.org/assignments/character-sets/character-sets.xhtml
//
// Selects a decoder for the given encoding label.
//
// The label is normalised with cleanup_encoding() and compared against the
// known UTF-8 / UTF-16 / UTF-32 / ASCII aliases below. When none of them
// matches and `factory` is non-null, the request is forwarded to
// `factory->create()` with the original, un-normalised `encoding`. When there
// is no match and no factory, nullptr is returned.
//
// NOTE(review): the return type is spelled as a bare `std::unique_ptr` here;
// its template argument looks like it was lost somewhere -- confirm against
// the declaration in the corresponding header.
std::unique_ptr pick_decoder_for_encoding(
    std::string_view encoding, DecoderFactory* factory) {
  const std::string label = cleanup_encoding(encoding);

  // True when the normalised label equals any of the given aliases.
  const auto is_one_of = [&label](auto... aliases) {
    return ((label == aliases) || ...);
  };

  if (is_one_of("utf-8", "utf8")) {
    return create_utf8_decoder();
  }
  if (is_one_of("utf-16", "utf16")) {
    return create_utf16_decoder();
  }
  if (is_one_of("utf-16be", "utf16be")) {
    return create_utf16be_decoder();
  }
  if (is_one_of("utf-16le", "utf16le")) {
    return create_utf16le_decoder();
  }
  if (is_one_of("utf-32", "utf32")) {
    return create_utf32_decoder();
  }
  if (is_one_of("utf-32be", "utf32be")) {
    return create_utf32be_decoder();
  }
  if (is_one_of("utf-32le", "utf32le")) {
    return create_utf32le_decoder();
  }
  if (is_one_of("ascii", "us-ascii", "usascii", "iso-ir-6", "ansi-x3-4-1968",
                "ansi-x3-4-1986", "iso-646-irv1991", "iso646-us", "us",
                "ibm367", "cp367")) {
    return create_ascii_decoder();
  }

  // Not a built-in encoding: defer to the caller-supplied factory, if any.
  if (factory == nullptr) {
    return nullptr;
  }
  return factory->create(encoding);
}

}  // namespace sax
}  // namespace modxml