summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--meson.build20
-rw-r--r--src/prefix_tree.cc173
-rw-r--r--src/prefix_tree.hh36
-rw-r--r--test/prefix_tree.cc47
4 files changed, 276 insertions, 0 deletions
diff --git a/meson.build b/meson.build
index 99c7ec9..1607088 100644
--- a/meson.build
+++ b/meson.build
@@ -191,6 +191,16 @@ java_uescape_dep = declare_dependency(
dependencies: [buffer_dep, uio_dep],
)
+prefix_tree_lib = library(
+ 'prefix_tree',
+ sources: [
+ 'src/prefix_tree.hh',
+ 'src/prefix_tree.cc',
+ ],
+ include_directories: inc,
+)
+prefix_tree_dep = declare_dependency(link_with: prefix_tree_lib)
+
jkc = executable(
'jkc',
sources: [
@@ -341,6 +351,16 @@ test('java_uescape', executable(
],
))
+test('prefix_tree', executable(
+ 'test_prefix_tree',
+ sources: ['test/prefix_tree.cc'],
+ include_directories: inc,
+ dependencies: [
+ prefix_tree_dep,
+ test_dependencies,
+ ],
+))
+
run_clang_tidy = find_program('run-clang-tidy', required: false)
if run_clang_tidy.found()
diff --git a/src/prefix_tree.cc b/src/prefix_tree.cc
new file mode 100644
index 0000000..f16df22
--- /dev/null
+++ b/src/prefix_tree.cc
@@ -0,0 +1,173 @@
+#include "prefix_tree.hh"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <optional>
+#include <set>
+#include <string>
+#include <string_view>
+
+namespace prefix_tree {
+
+namespace {
+
+[[nodiscard]]
+inline uint8_t get_u8(std::string_view str, size_t index) {
+ return static_cast<uint8_t>(str[index] & 0xff);
+}
+
+[[nodiscard]]
+inline uint16_t get_u16(std::string_view str, size_t index) {
+ return (static_cast<uint16_t>(get_u8(str, index)) << 8) |
+ get_u8(str, index + 1);
+}
+
+inline void write_u8(std::string& str, uint8_t value) {
+ str.push_back(static_cast<char>(value));
+}
+
+inline void write_u16(std::string& str, uint16_t value) {
+ write_u8(str, value >> 8);
+ write_u8(str, value & 0xff);
+}
+
+class BuilderImpl : public Builder {
+ public:
+ BuilderImpl() = default;
+
+ void add(std::string_view str) override { strings_.emplace(str); }
+
+ [[nodiscard]]
+ std::optional<std::string> build() const override {
+ std::string tree;
+ std::string strings;
+
+ std::set<std::string_view> tmp;
+ for (auto const& str : strings_)
+ tmp.emplace(str);
+
+ if (!write_tree(tmp, tree, strings))
+ return std::nullopt;
+
+ std::string header;
+ if (tree.size() > 0xffff)
+ return std::nullopt;
+ write_u16(header, tree.size());
+
+ return header + tree + strings;
+ }
+
+ private:
+ bool write_tree(std::set<std::string_view> const& input, std::string& tree,
+ std::string& strings) const {
+ std::map<char, std::set<std::string_view>> buckets;
+ bool match = false;
+ for (auto& str : input) {
+ if (str.empty()) {
+ match = true;
+ continue;
+ }
+ buckets[str.front()].emplace(str.substr(1));
+ }
+
+ write_u8(tree, buckets.size() + (match ? 1 : 0));
+ if (match) {
+ write_u8(tree, 0);
+ write_u16(tree, 0);
+ write_u16(tree, 0);
+ }
+ std::string extra;
+ for (auto& pair : buckets) {
+ auto it = pair.second.begin();
+ auto str = *it;
+ for (++it; it != pair.second.end(); ++it) {
+ size_t i = 0;
+ while (i < str.size() && i < it->size() && str[i] == it->at(i))
+ ++i;
+ if (i == 0) {
+ str = "";
+ break;
+ }
+ str = str.substr(0, i);
+ }
+
+ write_u8(tree, 1 + str.size());
+ if (strings.size() > 0xffff)
+ return false;
+ write_u16(tree, strings.size());
+ strings.push_back(pair.first);
+ strings.append(str);
+ if (extra.size() > 0xffff)
+ return false;
+ write_u16(tree, extra.size());
+ if (str.empty()) {
+ if (!write_tree(pair.second, extra, strings))
+ return false;
+ } else {
+ std::set<std::string_view> tmp;
+ for (auto& str2 : pair.second) {
+ tmp.emplace(str2.substr(str.size()));
+ }
+ if (!write_tree(tmp, extra, strings))
+ return false;
+ }
+ }
+
+ tree.append(extra);
+ return true;
+ }
+
+ std::set<std::string> strings_;
+};
+
+} // namespace
+
+std::optional<size_t> lookup(std::string_view tree, std::string_view str) {
+ size_t base_str = 2 + get_u16(tree, 0);
+ size_t node = 2;
+ std::optional<size_t> match;
+ std::optional<size_t> earlier_match;
+
+ while (node < base_str && !str.empty()) {
+ auto children = get_u8(tree, node);
+
+ if (children == 0) {
+ // Leaf
+ return match;
+ }
+
+ size_t child_node = node + 1;
+ size_t child_end = child_node + (static_cast<size_t>(children) * 5);
+ for (; child_node < child_end; child_node += 5) {
+ uint8_t len = get_u8(tree, child_node);
+ uint16_t offset = get_u16(tree, child_node + 1);
+
+ if (str.starts_with(tree.substr(base_str + offset, len))) {
+ // Match but not a leaf, always first in the list of children
+ if (len == 0) {
+ earlier_match = match;
+ continue;
+ }
+ match = match.value_or(0) + len;
+ str = str.substr(len);
+ auto jump = get_u16(tree, child_node + 3);
+ node = child_end + jump;
+ break;
+ }
+ }
+
+ if (child_node == child_end)
+ return earlier_match;
+ }
+
+ if (node == base_str)
+ return earlier_match;
+ return match;
+}
+
+std::unique_ptr<Builder> builder() { return std::make_unique<BuilderImpl>(); }
+
+} // namespace prefix_tree
diff --git a/src/prefix_tree.hh b/src/prefix_tree.hh
new file mode 100644
index 0000000..6e4c792
--- /dev/null
+++ b/src/prefix_tree.hh
@@ -0,0 +1,36 @@
+#ifndef PREFIX_TREE_HH
+#define PREFIX_TREE_HH
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+
+namespace prefix_tree {
+
+class Builder {
+ public:
+ virtual ~Builder() = default;
+
+ virtual void add(std::string_view str) = 0;
+ [[nodiscard]]
+ virtual std::optional<std::string> build() const = 0;
+
+ protected:
+ Builder() = default;
+
+ Builder(Builder const&) = delete;
+ Builder& operator=(Builder const&) = delete;
+};
+
+[[nodiscard]]
+std::unique_ptr<Builder> builder();
+
+// Returns the length of the prefix of str that exists in tree.
+// Returns nullopt if prefix of str doesn't match any string in tree.
+[[nodiscard]]
+std::optional<size_t> lookup(std::string_view tree, std::string_view str);
+
+} // namespace prefix_tree
+
+#endif // PREFIX_TREE_HH
diff --git a/test/prefix_tree.cc b/test/prefix_tree.cc
new file mode 100644
index 0000000..6c00adb
--- /dev/null
+++ b/test/prefix_tree.cc
@@ -0,0 +1,47 @@
+#include "prefix_tree.hh"
+
+#include <gtest/gtest.h>
+
+TEST(prefix_tree, empty) {
+ auto builder = prefix_tree::builder();
+ auto tree = builder->build();
+ ASSERT_TRUE(tree.has_value());
+ auto ret = prefix_tree::lookup(tree.value(), "");
+ EXPECT_FALSE(ret.has_value());
+ ret = prefix_tree::lookup(tree.value(), "foo");
+ EXPECT_FALSE(ret.has_value());
+}
+
+TEST(prefix_tree, sanity) {
+ auto builder = prefix_tree::builder();
+ builder->add("foo");
+ builder->add("bar");
+ builder->add("foobar");
+ auto tree = builder->build();
+ ASSERT_TRUE(tree.has_value());
+ auto ret = prefix_tree::lookup(tree.value(), "");
+ EXPECT_FALSE(ret.has_value());
+ ret = prefix_tree::lookup(tree.value(), "foo");
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(3, ret.value());
+ ret = prefix_tree::lookup(tree.value(), "foobar");
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(6, ret.value());
+ ret = prefix_tree::lookup(tree.value(), "bar");
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(3, ret.value());
+ ret = prefix_tree::lookup(tree.value(), "ba");
+ EXPECT_FALSE(ret.has_value());
+ ret = prefix_tree::lookup(tree.value(), "barf");
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(3, ret.value());
+ ret = prefix_tree::lookup(tree.value(), "foob");
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(3, ret.value());
+ ret = prefix_tree::lookup(tree.value(), "foozum");
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(3, ret.value());
+ ret = prefix_tree::lookup(tree.value(), "barfoo");
+ ASSERT_TRUE(ret.has_value());
+ EXPECT_EQ(3, ret.value());
+}