From 6300a2b364015cb6923557802aeb3daedb691f8c Mon Sep 17 00:00:00 2001 From: Horst Schirmeier Date: Tue, 29 Apr 2014 18:06:08 +0200 Subject: [PATCH] util: SumTree implementation The SumTree implements an efficient tree data structure for "roulette-wheel" sampling, or "sampling with fault expansion", i.e., sampling of trace entries / pilots without replacement and with a picking probability proportional to the entries' sizes. For every sample, the naive approach picks a random number between 0 and the sum of all entry sizes minus one. It then iterates over all entries and sums their sizes until the sum exceeds the random number. The current entry gets picked. The main disadvantage is the linear complexity, which gets unpleasant for millions of entries. The core idea behind the SumTree implementation is to maintain the size sum of groups of entries, kept in "buckets". Thereby, a bucket can be quickly jumped over. To keep bucket sizes (and thereby linear search times) bounded, more bucket hierarchy levels are introduced when a defined bucket size limit is reached. Note that the current implementation is built for a pure growth phase (when the tree gets filled with pilots from the database), followed by a sampling phase when the tree gets emptied. It does not handle a mixed add/remove case very smartly, although it should remain functional. 
Change-Id: If05e9700bc84761b5bc31006402641e7112b3a72 --- src/core/util/CMakeLists.txt | 3 + src/core/util/SumTree.hpp | 189 +++++++++++++++++++++++++++ src/core/util/testing/SumTreeTest.cc | 34 +++++ 3 files changed, 226 insertions(+) create mode 100644 src/core/util/SumTree.hpp create mode 100644 src/core/util/testing/SumTreeTest.cc diff --git a/src/core/util/CMakeLists.txt b/src/core/util/CMakeLists.txt index 06052098..b8cd483d 100644 --- a/src/core/util/CMakeLists.txt +++ b/src/core/util/CMakeLists.txt @@ -89,3 +89,6 @@ endif (BUILD_LLVM_DISASSEMBLER) add_executable(memorymap-test testing/memorymap-test.cc) target_link_libraries(memorymap-test fail-util) add_test(NAME memorymap-test WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testing COMMAND memorymap-test) + +add_executable(sumtree-test testing/SumTreeTest.cc) +add_test(NAME sumtree-test WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testing COMMAND sumtree-test) diff --git a/src/core/util/SumTree.hpp b/src/core/util/SumTree.hpp new file mode 100644 index 00000000..998e3e05 --- /dev/null +++ b/src/core/util/SumTree.hpp @@ -0,0 +1,189 @@ +#ifndef __SUM_TREE_HPP__ +#define __SUM_TREE_HPP__ + +#include +#include +#include + +// The SumTree implements an efficient tree data structure for +// "roulette-wheel" sampling, or "sampling with fault expansion", i.e., +// sampling of trace entries / pilots without replacement and with a +// picking probability proportional to the entries' sizes. +// +// For every sample, the naive approach picks a random number between 0 +// and the sum of all entry sizes minus one. It then iterates over all +// entries and sums their sizes until the sum exceeds the random number. +// The current entry gets picked. The main disadvantage is the linear +// complexity, which gets unpleasant for millions of entries. +// +// The core idea behind the SumTree implementation is to maintain the +// size sum of groups of entries, kept in "buckets". Thereby, a bucket +// can be quickly jumped over. 
To keep bucket sizes (and thereby linear +// search times) bounded, more bucket hierarchy levels are introduced +// when a defined bucket size limit is reached. +// +// Note that the current implementation is built for a pure growth phase +// (when the tree gets filled with pilots from the database), followed by +// a sampling phase when the tree gets emptied. It does not handle a +// mixed add/remove case very smartly, although it should remain +// functional. + +namespace fail { + +//! Size-weighted container: add() stores copies of elements, get() removes +//! and returns the element whose size interval covers a given position. +//! T must provide a nested size_type and a size() member, and must be +//! copyable and default-constructible (see get()). +template +class SumTree { + //! Bucket data structure for tree nodes. Its destructor deletes every + //! pointer in children, so a bucket owns the whole subtree below it. + struct Bucket { + Bucket() : size(0) {} + ~Bucket(); + //! Sum of size() over all elements stored in this bucket or below it + typename T::size_type size; + //! Sub-buckets, empty for leaf nodes + std::vector children; + //! Contained elements, empty for inner nodes + std::vector elements; + }; + + //! Root node, allocated in the constructor and deleted in the destructor. + //! NOTE(review): no copy constructor or assignment operator is declared in + //! this class, so a copied SumTree would share m_root and delete it + //! twice -- confirm that instances are never copied. + Bucket *m_root; + //! Tree depth: nodes at level m_depth are leaf nodes, others are inner nodes + unsigned m_depth; +public: + SumTree() : m_root(new Bucket), m_depth(0) {} + ~SumTree() { delete m_root; } + //! Adds a copy of element to the tree; elements with size() == 0 are + //! silently dropped. + void add(const T& element); + //! Retrieves (and removes) element at random number position. + //! pos must be smaller than get_size(); this is only checked by an assert. + T get(typename T::size_type pos) { return get(pos, m_root, 0); } + //! Yields the sum over all elements in the tree. + typename T::size_type get_size() const { return m_root->size; } +private: + //! Internal, recursive version of add(). Returns false when the subtree + //! below *node has no room left, true once the element has been stored. + bool add(Bucket **node, const T& element, unsigned depth_remaining); + //! Internal, recursive version of get(). sum is the accumulated size of + //! everything located before node's subtree. 
+ T get(typename T::size_type pos, Bucket *node, typename T::size_type sum); +}; + +// template implementation + +//! Deletes all direct child buckets; each child's destructor in turn deletes +//! its own children. +template +SumTree::Bucket::~Bucket() +{ + for (typename std::vector::const_iterator it = children.begin(); + it != children.end(); ++it) { + delete *it; + } +} + +//! Public add(): tries the recursive add() on the current tree and, if that +//! reports a full tree, adds one level on top of the old root and retries. +template +void SumTree::add(const T& element) +{ + if (element.size() == 0) { + // pilots with size == 0 cannot be picked anyway + return; + } + + if (add(&m_root, element, m_depth)) { + // tree wasn't full yet, add succeeded + return; + } + + // tree is full, move everything one level down: the old root becomes the + // only child of a fresh root that starts out with the same size sum + ++m_depth; + Bucket *b = new Bucket; + b->children.push_back(m_root); + b->size = m_root->size; + m_root = b; + + // retry; NOTE(review): the result is not checked. The retry can only + // succeed if the new root may take a second child, i.e. presumably + // BUCKETSIZE is expected to be at least 2 -- TODO confirm. + add(&m_root, element, m_depth); +} + +//! Recursive add(): at every level only the newest (last) child is tried, +//! older children are never revisited. depth_remaining counts the levels +//! between *node and the leaf level; 0 means *node is a leaf. +template +bool SumTree::add(Bucket **node, const T& element, unsigned depth_remaining) +{ + // non-leaf node? + if (depth_remaining) { + // no children yet? create one. + if ((*node)->children.size() == 0) { + (*node)->children.push_back(new Bucket); + } + + // adding to newest child worked? + if (add(&(*node)->children.back(), element, depth_remaining - 1)) { + (*node)->size += element.size(); + return true; + } + + // newest child full, may we create another one? The result of the add() + // into the fresh child is not checked (NOTE(review): presumably a + // brand-new bucket always has room -- TODO confirm). 
+ if ((*node)->children.size() < BUCKETSIZE) { + (*node)->children.push_back(new Bucket); + add(&(*node)->children.back(), element, depth_remaining - 1); + (*node)->size += element.size(); + return true; + } + // recursive add ultimately failed, subtree full + return false; + + // leaf node: store the element here unless BUCKETSIZE elements are + // already present + } else { + if ((*node)->elements.size() < BUCKETSIZE) { + (*node)->elements.push_back(element); + (*node)->size += element.size(); + return true; + } + return false; + } +} + +//! Recursive get(): removes and returns the element whose size interval +//! contains pos. sum is the accumulated size of everything that precedes +//! node's subtree; every bucket on the path has its size reduced by the +//! size of the removed element. +template +T SumTree::get(typename T::size_type pos, Bucket *node, typename T::size_type sum) +{ + // sanity check: pos must fall into the range covered by node + assert(pos >= sum && pos < sum + node->size); + + // will only be entered for inner nodes + for (typename std::vector::iterator it = node->children.begin(); + it != node->children.end(); ) { + sum += (*it)->size; + if (sum <= pos) { + ++it; + continue; + } + + // found containing bucket, recurse with sum reset to the total that + // precedes this child + sum -= (*it)->size; + T e = get(pos, *it, sum); + node->size -= e.size(); + // remove empty (or, at least, zero-sized) child? erase() invalidates it, + // but we return right afterwards. 
+ if ((*it)->size == 0) { + delete *it; + node->children.erase(it); + } + return e; + } + + // will only be entered for leaf nodes + for (typename std::vector::iterator it = node->elements.begin(); + it != node->elements.end(); ) { + sum += it->size(); + if (sum <= pos) { + ++it; + continue; + } + + // found pilot: copy it out before erasing it from the bucket + T e = *it; + node->size -= e.size(); + node->elements.erase(it); + return e; + } + + // this should never happen; if assert() is compiled out, a + // value-initialized T is returned instead + assert(0); + return T(); +} + +} // namespace + +#endif diff --git a/src/core/util/testing/SumTreeTest.cc b/src/core/util/testing/SumTreeTest.cc new file mode 100644 index 00000000..1757cd17 --- /dev/null +++ b/src/core/util/testing/SumTreeTest.cc @@ -0,0 +1,34 @@ +#include "util/SumTree.hpp" + +#include +#define LOG std::cerr + +using std::endl; + +//! Minimal element type for the test; size() reports duration. +struct Pilot { + uint32_t id; + uint32_t instr2; + uint32_t data_address; + uint64_t duration; + + typedef uint64_t size_type; + size_type size() const { return duration; } +}; + +//! Adds pilots with durations 0..20 to a tree (add() drops the one with +//! duration 0), then empties it by repeatedly requesting the middle +//! position and logging each retrieved pilot. +int main() +{ + fail::SumTree tree; + for (int i = 0; i <= 20; ++i) { + Pilot p; + // only duration is set, the other fields stay uninitialized + p.duration = i; + tree.add(p); + } + + while (tree.get_size() > 0) { + uint64_t pos = tree.get_size() / 2; + LOG << "MAIN tree.get_size() = " << tree.get_size() + << ", trying to retrieve pos = " << pos << endl; + Pilot p = tree.get(pos); + LOG << "MAIN retrieved pilot with duration " << p.duration << endl; + } +}