// NOTE(review): the copy of this file that was reviewed had every `<...>`
// span removed (system header names, template arguments, cast targets).
// They are restored below from how each value is used. Please confirm
// against version control, in particular:
//   - the list of system headers,
//   - the key type / comparator of str2gc,
//   - the element type of std::span / std::vector used for rows/arguments,
//   - the headers and array element types written by print_header and
//     print_body into the generated file.

#include "args.hh"
#include "csv.hh"
#include "decompress.hh"
#include "ugc.hh"

#include <charconv>
#include <cstdint>
#include <expected>
#include <format>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <ostream>
#include <span>
#include <string>
#include <string_view>
#include <system_error>
#include <utility>
#include <vector>

namespace {

// Two-letter general category abbreviation (third column of the input
// file) -> enum value. std::less<> allows lookup by std::string_view
// without building a temporary std::string.
const std::map<std::string, u::GeneralCategory, std::less<>> str2gc{
    {"Lu", u::GeneralCategory::LETTER_UPPERCASE},
    {"Ll", u::GeneralCategory::LETTER_LOWERCASE},
    {"Lt", u::GeneralCategory::LETTER_TITLECASE},
    {"Lm", u::GeneralCategory::LETTER_MODIFIER},
    {"Lo", u::GeneralCategory::LETTER_OTHER},
    {"Mn", u::GeneralCategory::MARK_NONSPACING},
    {"Mc", u::GeneralCategory::MARK_SPACING_COMBINING},
    {"Me", u::GeneralCategory::MARK_SPACING_ENCLOSING},
    {"Nd", u::GeneralCategory::NUMBER_DIGIT},
    {"Nl", u::GeneralCategory::NUMBER_LETTER},
    {"No", u::GeneralCategory::NUMBER_OTHER},
    {"Pc", u::GeneralCategory::PUNCTUATION_CONNECTOR},
    {"Pd", u::GeneralCategory::PUNCTUATION_DASH},
    {"Ps", u::GeneralCategory::PUNCTUATION_OPEN},
    {"Pe", u::GeneralCategory::PUNCTUATION_CLOSE},
    {"Pi", u::GeneralCategory::PUNCTUATION_INITIAL_QUOTE},
    {"Pf", u::GeneralCategory::PUNCTUATION_FINAL_QUOTE},
    {"Po", u::GeneralCategory::PUNCTUATION_OTHER},
    {"Sm", u::GeneralCategory::SYMBOL_MATH},
    {"Sc", u::GeneralCategory::SYMBOL_CURRENCY},
    {"Sk", u::GeneralCategory::SYMBOL_MODIFIER},
    {"So", u::GeneralCategory::SYMBOL_OTHER},
    {"Zs", u::GeneralCategory::SEPARATOR_SPACE},
    {"Zl", u::GeneralCategory::SEPARATOR_LINE},
    {"Zp", u::GeneralCategory::SEPARATOR_PARAGRAPH},
    {"Cc", u::GeneralCategory::OTHER_CONTROL},
    {"Cf", u::GeneralCategory::OTHER_FORMAT},
    {"Cs", u::GeneralCategory::OTHER_SURROGATE},
    {"Co", u::GeneralCategory::OTHER_PRIVATE_USE},
    {"Cn", u::GeneralCategory::OTHER_UNASSIGNED},
};

// Writes the start of the generated source file: its includes, the opening
// of namespace u and the signature of `<prefix>lookup_gc`.
void print_header(std::ostream& out, std::string_view prefix) {
  out << "#include \"ugc.hh\"\n"
      << "\n"
      << "#include <array>\n"
      << "#include <cstddef>\n"
      << "#include <cstdint>\n"
      << "\n"
      << "namespace u {\n"
      << "\n"
      << "extern GeneralCategory " << prefix << "lookup_gc(uint32_t code) {\n";
}

// Writes the body of the generated lookup function.
//
// Consecutive code points that share a category are merged into runs. The
// generated code stores each run as a [first, last] pair in `codes` with the
// run's category at the same index in `categories`, and binary searches the
// pairs. Code points not covered by any run yield OTHER_UNASSIGNED.
//
// An empty `data` produces zero-length arrays, so the generated function
// always returns OTHER_UNASSIGNED.
void print_body(std::ostream& out,
                std::map<uint32_t, u::GeneralCategory> const& data) {
  std::vector<uint32_t> codes;
  std::vector<u::GeneralCategory> categories;

  // Guard: data.begin() must not be dereferenced for an empty map.
  if (!data.empty()) {
    auto it = data.begin();
    codes.emplace_back(it->first);
    categories.emplace_back(it->second);
    // Code point that would extend the current run.
    uint32_t next = it->first + 1;
    for (++it; it != data.end(); ++it) {
      if (it->first == next && categories.back() == it->second) {
        ++next;
      } else {
        // Close the current run and open a new one.
        codes.emplace_back(next - 1);
        codes.emplace_back(it->first);
        categories.emplace_back(it->second);
        next = it->first + 1;
      }
    }
    codes.emplace_back(next - 1);
  }

  out << " static std::array<uint32_t, " << codes.size() << "> codes{";
  for (auto code : codes) {
    out << code << ",";
  }
  out << " };\n";

  out << " static std::array<uint8_t, " << categories.size()
      << "> categories{";
  for (auto category : categories) {
    // Cast to a non-character integer type so the value is printed as a
    // number.
    out << static_cast<unsigned>(category) << ",";
  }
  out << "};\n";

  out << " size_t low = 0;\n"
      << " size_t high = " << (codes.size() / 2) << ";\n"
      << " while (low < high) {\n"
      << " size_t m = (low + high) / 2;\n"
      << " uint32_t start = codes[m * 2];\n"
      << " if (code < start) {\n"
      << " high = m;\n"
      << " } else {\n"
      << " uint32_t end = codes[(m * 2) + 1];\n"
      << " if (code <= end) {\n"
      << " return static_cast<GeneralCategory>(categories[m]);\n"
      << " }\n"
      << " low = m + 1;\n"
      << " }\n"
      << " }\n"
      << " return u::GeneralCategory::OTHER_UNASSIGNED;\n";
}

// Closes the generated function and namespace u.
void print_footer(std::ostream& out, std::string_view /* prefix */) {
  out << "}\n"
      << "\n"
      << "} // namespace u\n";
}

// Human readable text for an io::OpenError.
std::string_view ioerr2str(io::OpenError error) {
  switch (error) {
    case io::OpenError::NoSuchFile:
      return "No such file";
    case io::OpenError::NoAccess:
      return "No access";
    case io::OpenError::Error:
      return "Fatal error";
  }
  std::unreachable();
}

// Human readable text for an io::ReadError.
std::string_view ioerr2str(io::ReadError error) {
  switch (error) {
    case io::ReadError::InvalidData:
      return "Invalid (compressed) data";
    case io::ReadError::Error:
      return "Fatal error";
    case io::ReadError::MaxTooSmall:
      return "Too small buffer";
    case io::ReadError::Eof:
      return "Unexpected end of file";
  }
  std::unreachable();
}

// Extracts the code point (column 0, hexadecimal) and the general category
// (column 2) from one row.
//
// Returns an error message if the row does not have exactly 15 columns, the
// code is not a complete hexadecimal number that fits in uint32_t, or the
// category abbreviation is not in str2gc.
std::expected<std::pair<uint32_t, u::GeneralCategory>, std::string> parse_row(
    std::span<std::string_view> row) {
  // [code];[name];[gc];[cc];[bc];[decomposition];[nv-dec];[nv-dig];[nv-num];[bm];[alias];;[upper case];[lower case];[title case]
  if (row.size() != 15) {
    return std::unexpected(
        std::format("Invalid row ({} columns)", row.size()));
  }
  auto code_col = row[0];
  auto category_col = row[2];

  uint32_t code = 0;
  auto [ptr, ec] =
      std::from_chars(code_col.data(), code_col.data() + code_col.size(),
                      code, /* base */ 16);
  // Require that the whole column was consumed, not just a prefix.
  if (ec != std::errc() || ptr != code_col.data() + code_col.size()) {
    return std::unexpected(std::format("Invalid code value {}", code_col));
  }

  auto it = str2gc.find(category_col);
  if (it == str2gc.end()) {
    return std::unexpected(
        std::format("Invalid general category {}", category_col));
  }
  return std::make_pair(code, it->second);
}

// Reads `filename` (decompressing it first if the name ends in ".gz" or
// ".xz") as ';' separated rows and returns code point -> general category.
//
// A row whose name column ends with ", First>" must be followed directly by
// a row with the same name prefix ending with ", Last>" and the same
// category; every code point from the first to the last code (inclusive)
// is then added. Reading stops at the first empty row.
//
// On failure the returned message is prefixed with "<filename>:<number>:"
// where the number comes from the csv reader.
std::expected<std::map<uint32_t, u::GeneralCategory>, std::string> read(
    std::string_view filename) {
  auto maybe_reader = io::open(std::string(filename));
  if (!maybe_reader.has_value()) {
    return std::unexpected(std::format("Unable to open {} for reading: {}",
                                       filename,
                                       ioerr2str(maybe_reader.error())));
  }
  auto reader = std::move(maybe_reader.value());
  if (filename.ends_with(".gz")) {
    reader = decompress::gzip(std::move(reader));
  } else if (filename.ends_with(".xz")) {
    reader = decompress::xz(std::move(reader));
  }

  std::map<uint32_t, u::GeneralCategory> ret;
  auto csv_reader = csv::open(std::move(reader), ';');
  while (true) {
    auto row = csv_reader->read();
    if (!row.has_value()) {
      return std::unexpected(std::format("{}:{}: Error reading file: {}",
                                         filename, csv_reader->number(),
                                         ioerr2str(row.error())));
    }
    if (row->empty()) break;

    auto pair = parse_row(row.value());
    if (!pair.has_value()) {
      return std::unexpected(std::format("{}:{}: {}", filename,
                                         csv_reader->number(), pair.error()));
    }

    auto name_col = (*row)[1];
    if (!name_col.ends_with(", First>")) {
      // Ordinary single code point row.
      auto emplace_ret = ret.emplace(std::move(pair.value()));
      if (!emplace_ret.second) {
        return std::unexpected(std::format(
            "{}:{}: Duplicate value for {:#08x}", filename,
            csv_reader->number(), emplace_ret.first->first));
      }
      continue;
    }

    // Start of a range. Copy the prefix now: reading the next row replaces
    // the storage name_col refers to.
    std::string prefix(name_col.substr(0, name_col.size() - 8));
    row = csv_reader->read();
    if (!row.has_value()) {
      return std::unexpected(std::format("{}:{}: Error reading file: {}",
                                         filename, csv_reader->number(),
                                         ioerr2str(row.error())));
    }
    auto second_pair = parse_row(row.value());
    // This must test second_pair, not pair: a missing or malformed second
    // row (including end of input, which parse_row rejects as a row with 0
    // columns) has to be reported before the row is indexed or
    // second_pair is dereferenced.
    if (!second_pair.has_value()) {
      return std::unexpected(std::format("{}:{}: {}", filename,
                                         csv_reader->number(),
                                         second_pair.error()));
    }

    name_col = (*row)[1];
    if (!name_col.ends_with(", Last>") ||
        name_col.substr(0, name_col.size() - 7) != prefix) {
      return std::unexpected(
          std::format("{}:{}: Invalid range, {} doesn't match {}", filename,
                      csv_reader->number(), prefix, name_col));
    }
    if (pair->second != second_pair->second) {
      return std::unexpected(std::format(
          "{}:{}: Invalid range, general category doesn't match", filename,
          csv_reader->number()));
    }
    if (second_pair->first < pair->first) {
      return std::unexpected(std::format(
          "{}:{}: Invalid range, last code is before first code", filename,
          csv_reader->number()));
    }

    // Compare for equality at the end of each iteration instead of using
    // `c <= last`, which would never terminate for last == 0xFFFFFFFF.
    for (uint32_t c = pair->first;; ++c) {
      auto emplace_ret = ret.emplace(c, pair->second);
      if (!emplace_ret.second) {
        return std::unexpected(
            std::format("{}:{}: Duplicate value for {:#08x}", filename,
                        csv_reader->number(), c));
      }
      if (c == second_pair->first) break;
    }
  }
  return ret;
}

}  // namespace

// Usage: gen_u [OPTIONS...] UnicodeData [OUTPUT]
//
// Reads UnicodeData and writes the generated lookup function to OUTPUT, or
// to standard output when OUTPUT is missing or "-". Returns 1 on any usage,
// read or write error, 0 otherwise.
int main(int argc, char** argv) {
  auto args = Args::create();
  auto opt_help = args->option('h', "help", "display this text and exit");
  auto opt_prefix = args->option_argument('p', "prefix", "ARG",
                                          "Prefix for exported method");
  std::vector<std::string_view> arguments;
  if (!args->run(argc, argv, &arguments)) {
    args->print_error(std::cerr);
    std::cerr << "Try `gen_u --help` for usage\n";
    return 1;
  }
  if (opt_help->is_set()) {
    std::cout << "Usage: `gen_u [OPTIONS...] UnicodeData [OUTPUT]`\n"
              << "Generates a method for getting the general category for a "
              << "code point.\n"
              << "\n";
    args->print_help(std::cout);
    return 0;
  }
  if (!opt_prefix->is_set()) {
    std::cerr << "No prefix given.\n"
              << "Try `gen_u --help` for usage\n";
    return 1;
  }
  auto prefix = opt_prefix->argument();
  if (arguments.empty() || arguments.size() > 2) {
    std::cerr << "Expecting one or two argument. No more, no less.\n"
              << "Try `gen_u --help` for usage\n";
    return 1;
  }

  auto general_categories = read(arguments[0]);
  if (!general_categories.has_value()) {
    std::cerr << general_categories.error() << '\n';
    return 1;
  }

  if (arguments.size() < 2 || arguments[1] == "-") {
    print_header(std::cout, prefix);
    print_body(std::cout, general_categories.value());
    print_footer(std::cout, prefix);
  } else {
    std::fstream out{std::string(arguments[1]),
                     std::fstream::trunc | std::fstream::out};
    // Without this check an unwritable path would silently produce no
    // output and still exit with 0.
    if (!out) {
      std::cerr << "Unable to open " << arguments[1] << " for writing\n";
      return 1;
    }
    print_header(out, prefix);
    print_body(out, general_categories.value());
    print_footer(out, prefix);
    out.flush();
    if (!out) {
      std::cerr << "Error writing to " << arguments[1] << '\n';
      return 1;
    }
  }
  return 0;
}