From 4dddfd622977f84f0cf41847aec9e728d02bec65 Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Mon, 20 Oct 2025 22:01:05 +0200 Subject: uri: New module Decode URI encoded string, validating both hex and that encoded data is valid UTF-8. --- src/uri.cc | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 src/uri.cc (limited to 'src/uri.cc') diff --git a/src/uri.cc b/src/uri.cc new file mode 100644 index 0000000..b7a3edf --- /dev/null +++ b/src/uri.cc @@ -0,0 +1,70 @@ +#include "uri.hh" + +#include "u8.hh" + +#include +#include +#include +#include +#include + +namespace uri { + +namespace { + +inline std::optional hex(char c) { + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'A' && c <= 'F') + return 10 + (c - 'A'); + if (c >= 'a' && c <= 'f') + return 10 + (c - 'a'); + return std::nullopt; +} + +} // namespace + +std::optional decode(std::string_view input, + std::string& dst) { + auto i = input.find('%'); + if (i == std::string_view::npos) + return input; + + dst.clear(); + size_t last = 0; + bool check_utf8 = false; + while (true) { + if (input.size() - i < 3) + return std::nullopt; + auto a = hex(input[i + 1]); + auto b = hex(input[i + 2]); + if (!a.has_value() || !b.has_value()) + return std::nullopt; + dst.append(input, last, i - last); + auto c = (a.value() << 4) | b.value(); + if (c & 0x80) + check_utf8 = true; + dst.push_back(static_cast(c)); + last = i + 3; + i = input.find('%', last); + if (i == std::string::npos) { + dst.append(input, last); + break; + } + } + + if (check_utf8) { + std::span data{reinterpret_cast(dst.data()), + dst.size()}; + auto it = data.begin(); + while (it != data.end()) { + auto ret = u8::read(it, data.end()); + if (!ret.has_value()) + return std::nullopt; + } + } + + return dst; +} + +} // namespace uri -- cgit v1.2.3-70-g09d2