diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c93ef5..112b2b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ set(SOURCES # Libraries include(FetchContent) find_package(raylib REQUIRED) +find_package(libmorton REQUIRED) find_package(Boost COMPONENTS program_options REQUIRED) set(LIBS raylib Boost::headers Boost::program_options) set(FLAGS "") @@ -52,6 +53,8 @@ if(NOT DISABLE_BACKWARD) list(APPEND LIBS Backward::Backward) list(APPEND FLAGS BACKWARD) + + message("-- BACKWARD: Enabled") endif() if(NOT DISABLE_TRACY) @@ -67,6 +70,8 @@ if(NOT DISABLE_TRACY) list(APPEND LIBS TracyClient) list(APPEND FLAGS TRACY) + + message("-- TRACY: Enabled") endif() # Set this after fetching tracy to hide tracy's warnings. @@ -111,6 +116,8 @@ if(NOT DISABLE_TESTS AND NOT WIN32) include(Catch) catch_discover_tests(tests) + + message("-- TESTS: Enabled") endif() # Benchmarking @@ -124,14 +131,16 @@ if(NOT DISABLE_BENCH AND NOT WIN32) add_executable(benchmarks ${BENCH_SOURCES} ${SOURCES}) target_include_directories(benchmarks PRIVATE include) target_link_libraries(benchmarks benchmark raylib) + + message("-- BENCHMARKS: Enabled") endif() # LTO include(CheckIPOSupported) check_ipo_supported(RESULT supported OUTPUT error) if(supported) - message(STATUS "IPO / LTO enabled") + message("-- IPO/LTO: Enabled") set_property(TARGET masssprings PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) else() - message(STATUS "IPO / LTO not supported") + message("-- IPO/LTO: Disabled") endif() \ No newline at end of file diff --git a/flake.nix b/flake.nix index 8f5fd15..01968a1 100644 --- a/flake.nix +++ b/flake.nix @@ -135,7 +135,7 @@ rec { # Define custom dependencies # =========================================================================================== - raygui = stdenv.mkDerivation (finalAttrs: { + raygui = stdenv.mkDerivation rec { pname = "raygui"; version = "4.0-unstable-2026-02-24"; @@ -163,13 +163,13 @@ rec { Name: raygui Description: Simple and easy-to-use immediate-mode gui library URL: https://github.com/raysan5/raygui - Version: ${finalAttrs.version} + Version: ${version} Cflags: -I"{includedir}" EOF runHook postInstall ''; - }); + }; thread-pool = stdenv.mkDerivation { pname = "thread-pool"; @@ -202,11 +202,19 @@ rec { }; # Header-only library - dontBuild = true; - installPhase = '' - mkdir -p $out - mv ./include $out/include - ''; + # dontBuild = true; + # installPhase = '' + # mkdir -p $out + # mv ./include $out/include + # ''; + + nativeBuildInputs = with pkgs; [cmake]; + + cmakeFlags = [ + "-DBUILD_TESTING=OFF" + "-DCMAKE_INSTALL_INCLUDEDIR=include" + "-DCMAKE_INSTALL_DATADIR=share" + ]; }; # =========================================================================================== diff --git a/include/octree.hpp b/include/octree.hpp index be48c1d..3d39a46 100644 --- a/include/octree.hpp +++ b/include/octree.hpp @@ -8,6 +8,7 @@ #include #include +#include class octree { @@ -16,24 +17,21 @@ class octree public: Vector3 mass_center = Vector3Zero(); float mass_total = 0.0; - Vector3 box_min = Vector3Zero(); // area start - Vector3 box_max = Vector3Zero(); // area end + uint8_t depth = 0; + float size = 0.0f; // Because our octree cells are cubic we don't need to store the bounds std::array children = {-1, -1, -1, -1, -1, -1, -1, -1}; int mass_id = -1; bool leaf = true; - - public: - node(const Vector3& _box_min, const Vector3& _box_max) : box_min(_box_min), box_max(_box_max) {} }; -public: - static constexpr int MAX_DEPTH = 20; +private: + // 21 * 3 = 63, fits in uint64_t for combined x/y/z morton-code + static constexpr int MAX_DEPTH = 21; std::vector nodes; // This approach is actually slower than the array of nodes // beacuse we access all the attributes in the same function - // std::vector mass_centers; // std::vector mass_totals; // std::vector box_mins; @@ -50,22 +48,58 @@ public: octree(octree&& move) = delete; auto operator=(octree&& move) -> octree& = delete; -public: - auto clear() -> void; - auto reserve(size_t count) -> void; - [[nodiscard]] auto empty() const -> bool; +private: [[nodiscard]] INLINE static inline auto get_octant(const Vector3& box_min, const Vector3& box_max, const Vector3& pos) -> int; [[nodiscard]] INLINE static inline auto get_child_bounds(const Vector3& box_min, const Vector3& box_max, int octant) -> std::pair; - INLINE inline auto create_empty_leaf(const Vector3& box_min, const Vector3& box_max) -> int; - [[nodiscard]] auto get_child_count(int node_idx) const -> int; - auto insert(int node_idx, int mass_id, const Vector3& pos, float mass, int depth) -> void; - static auto build_octree(octree& t, const std::vector& positions) -> void; - [[nodiscard]] auto calculate_force(int node_idx, const Vector3& pos) const -> Vector3; + // Map a floating point coordinate to a discrete integer (so its morton-code can be computed) + // The "bits" parameter determines the discrete axis resolution + [[nodiscard]] INLINE static inline auto quantize_axis(float coordinate, + float box_min, + float box_max, + int bits) -> uint32_t; + + [[nodiscard]] INLINE static inline auto pos_to_morton(const Vector3& p, + const Vector3& root_min, + const Vector3& root_max) -> uint64_t; + + [[nodiscard]] INLINE static inline auto jitter_pos(Vector3 p, + uint32_t seed, + const Vector3& root_min, + const Vector3& root_max, + float root_extent) -> Vector3; + + // Use this to obtain an ancestor node of a leaf node (on any level). + // Because the morton codes (interleaved coordinates) encode the octree path, we can take + // the morten code of any leaf and only take the 3*n first interleaved bits to find the + // leaf ancestor on level n. + // Leaf Code: [101 110 100 001] -> Ancestors (from leaf to root): + // - [101 110 100] + // - [101 110] + // - [101] (root) + [[nodiscard]] INLINE static inline auto path_to_ancestor(uint64_t leaf_code, int leaf_depth, int depth) -> uint64_t; + + // Use this to obtain the octant a leaf node is contained in (on any level). + // The 3 interleaved bits in the morten code encode the octant [0, 7]. + // Leaf Code: [101 110 100 001] -> Octants: + // - [100] (Level 2) + // - [110] (Level 1) + // - [101] (Level 0) + [[nodiscard]] INLINE static inline auto octant_at_level(uint64_t leaf_code, int level, int leaf_depth) -> int; + +public: + auto clear() -> void; + auto reserve(size_t count) -> void; + [[nodiscard]] auto empty() const -> bool; + [[nodiscard]] auto root() const -> const node&; + + // Morton/linear octree implementation + static auto build_octree_morton(octree& t, const std::vector& positions) -> void; + [[nodiscard]] auto calculate_force_morton(int node_idx, const Vector3& pos, int self_id) const -> Vector3; }; INLINE inline auto octree::get_octant(const Vector3& box_min, const Vector3& box_max, const Vector3& pos) -> int @@ -101,18 +135,75 @@ INLINE inline auto octree::get_child_bounds(const Vector3& box_min, return std::make_pair(min, max); } -INLINE inline auto octree::create_empty_leaf(const Vector3& box_min, const Vector3& box_max) -> int +INLINE inline auto octree::quantize_axis(const float coordinate, + const float box_min, + const float box_max, + const int bits) -> uint32_t { - nodes.emplace_back(box_min, box_max); - return static_cast(nodes.size() - 1); + const float extent = box_max - box_min; + if (extent <= 0.0f) { + return 0; + } - // mass_centers.emplace_back(Vector3Zero()); - // mass_totals.emplace_back(0); - // box_mins.emplace_back(box_min); - // box_maxs.emplace_back(box_max); - // childrens.push_back({-1, -1, -1, -1, -1, -1, -1, -1}); - // mass_ids.emplace_back(-1); - // leafs.emplace_back(true); + float normalized = (coordinate - box_min) / extent; // normalize to [0,1] + normalized = std::max(0.0f, std::min(normalized, std::nextafter(1.0f, 0.0f))); // avoid exactly 1.0 + + // bits up to 21 => (1u << bits) safe in 32-bit + const uint32_t grid_max = (1u << bits) - 1u; + return static_cast(normalized * static_cast(grid_max)); +} + +INLINE inline auto octree::pos_to_morton(const Vector3& p, const Vector3& root_min, const Vector3& root_max) -> uint64_t +{ + const uint32_t x = quantize_axis(p.x, root_min.x, root_max.x, MAX_DEPTH); + const uint32_t y = quantize_axis(p.y, root_min.y, root_max.y, MAX_DEPTH); + const uint32_t z = quantize_axis(p.z, root_min.z, root_max.z, MAX_DEPTH); + return libmorton::morton3D_64_encode(x, y, z); +} + +INLINE inline auto octree::jitter_pos(Vector3 p, + const uint32_t seed, + const Vector3& root_min, + const Vector3& root_max, + const float root_extent) -> Vector3 +{ + // Use a hash to calculate a deterministic jitter: The same position should always get the same jitter. + // We want this to get stable physics, particles at the same position shouldn't get different jitters + // across frames... + uint32_t h = (seed ^ 61u) ^ (seed >> 16); + h *= 9u; + h = h ^ (h >> 4); + h *= 0x27d4eb2du; + h = h ^ (h >> 15); + + // finest cell size at depth L + const float finest_cell = root_extent / static_cast(1u << MAX_DEPTH); + const float s = finest_cell * 1e-4f; // small pp + + p.x += (h & 1u) ? +s : -s; + p.y += (h & 2u) ? +s : -s; + p.z += (h & 4u) ? +s : -s; + + // clamp back into bounds just in case + p.x = std::max(root_min.x, std::min(p.x, root_max.x)); + p.y = std::max(root_min.y, std::min(p.y, root_max.y)); + p.z = std::max(root_min.z, std::min(p.z, root_max.z)); + + return p; +} + +INLINE inline auto octree::path_to_ancestor(const uint64_t leaf_code, const int leaf_depth, const int depth) -> uint64_t +{ + // keep top 3*depth bits; drop the rest + const int drop = 3 * (leaf_depth - depth); + return (drop > 0) ? (leaf_code >> drop) : leaf_code; +} + +INLINE inline auto octree::octant_at_level(const uint64_t leaf_code, const int level, const int leaf_depth) -> int +{ + // level 1 => child of root => topmost 3 bits + const int shift = 3 * (leaf_depth - level); + return static_cast((leaf_code >> shift) & 0x7ull); } #endif \ No newline at end of file diff --git a/include/util.hpp b/include/util.hpp index 6bcf2ec..f80c00e 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -7,6 +7,10 @@ #define INLINE __attribute__((always_inline)) #define PACKED __attribute__((packed)) +#define STARTTIME const auto start = std::chrono::high_resolution_clock::now() +#define ENDTIME(msg) const auto end = std::chrono::high_resolution_clock::now(); \ + infoln("{}. Took {}ms.", msg, std::chrono::duration_cast(end - start).count()) + // std::variant visitor // https://en.cppreference.com/w/cpp/utility/variant/visit @@ -92,28 +96,32 @@ template auto traceln(std::format_string fmt, Args&&... args) -> void { std::cout << std::format("[{}TRACE{}]: ", ansi_bold_fg(fg::cyan), ansi_reset()) << std::format( - fmt, std::forward(args)...) << std::endl; + fmt, + std::forward(args)...) << std::endl; } template auto infoln(std::format_string fmt, Args&&... args) -> void { std::cout << std::format("[{}INFO{}]: ", ansi_bold_fg(fg::blue), ansi_reset()) << std::format( - fmt, std::forward(args)...) << std::endl; + fmt, + std::forward(args)...) << std::endl; } template auto warnln(std::format_string fmt, Args&&... args) -> void { std::cout << std::format("[{}WARNING{}]: ", ansi_bold_fg(fg::yellow), ansi_reset()) << std::format( - fmt, std::forward(args)...) << std::endl; + fmt, + std::forward(args)...) << std::endl; } template auto errln(std::format_string fmt, Args&&... args) -> void { std::cout << std::format("[{}ERROR{}]: ", ansi_bold_fg(fg::red), ansi_reset()) << std::format( - fmt, std::forward(args)...) << std::endl; + fmt, + std::forward(args)...) << std::endl; } #endif \ No newline at end of file diff --git a/src/cpu_layout_engine.cpp b/src/cpu_layout_engine.cpp index e37beae..6a24040 100644 --- a/src/cpu_layout_engine.cpp +++ b/src/cpu_layout_engine.cpp @@ -103,7 +103,7 @@ auto cpu_layout_engine::physics_thread(physics_state& state, const std::optional last_mass_count = mass_springs.positions.size(); } #else - octree::build_octree(mass_springs.tree, mass_springs.positions); + octree::build_octree_morton(mass_springs.tree, mass_springs.positions); #endif mass_springs.clear_forces(); @@ -157,7 +157,7 @@ auto cpu_layout_engine::physics_thread(physics_state& state, const std::optional if (mass_springs.tree.empty()) { state.mass_center = Vector3Zero(); } else { - state.mass_center = mass_springs.tree.nodes[0].mass_center; + state.mass_center = mass_springs.tree.root().mass_center; } state.masses.clear(); diff --git a/src/cpu_spring_system.cpp b/src/cpu_spring_system.cpp index fa8dd15..4387a8a 100644 --- a/src/cpu_spring_system.cpp +++ b/src/cpu_spring_system.cpp @@ -45,7 +45,7 @@ auto cpu_spring_system::add_spring(size_t a, size_t b) -> void // If the offset moves the mass closer to the current center of mass, flip it if (!tree.empty()) { const Vector3 mass_center_direction = - Vector3Subtract(positions[a], tree.nodes[0].mass_center); + Vector3Subtract(positions[a], tree.root().mass_center); const float mass_center_distance = Vector3Length(mass_center_direction); if (mass_center_distance > 0 && Vector3DotProduct(offset, mass_center_direction) < 0.0f) { @@ -126,7 +126,7 @@ auto cpu_spring_system::calculate_repulsion_forces( const auto solve_octree = [&](const int i) { - const Vector3 force = tree.calculate_force(0, positions[i]); + const Vector3 force = tree.calculate_force_morton(0, positions[i], i); forces[i] += force; }; diff --git a/src/main.cpp b/src/main.cpp index 63d9da7..5ace2c3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,7 +11,7 @@ #include -#ifndef WIN32 +#if not defined(_WIN32) #include namespace po = boost::program_options; #endif @@ -235,7 +235,7 @@ enum class runmode auto argparse(const int argc, char* argv[]) -> runmode { - #ifndef WIN32 + #if not defined(_WIN32) po::options_description desc("Allowed options"); desc.add_options() // ("help", "produce help message") // diff --git a/src/octree.cpp b/src/octree.cpp index 0c60ca7..de33648 100644 --- a/src/octree.cpp +++ b/src/octree.cpp @@ -7,205 +7,316 @@ auto octree::clear() -> void { nodes.clear(); - - // mass_centers.clear(); - // mass_totals.clear(); - // box_mins.clear(); - // box_maxs.clear(); - // childrens.clear(); - // mass_ids.clear(); - // leafs.clear(); } auto octree::reserve(const size_t count) -> void { nodes.reserve(count); - - // mass_centers.reserve(count); - // mass_totals.reserve(count); - // box_mins.reserve(count); - // box_maxs.reserve(count); - // childrens.reserve(count); - // mass_ids.reserve(count); - // leafs.reserve(count); } auto octree::empty() const -> bool { return nodes.empty(); - - // return mass_centers.empty(); } -auto octree::get_child_count(const int node_idx) const -> int +auto octree::root() const -> const node& { - int child_count = 0; - for (const int child : nodes[node_idx].children) { - if (child != -1) { - ++child_count; - } - } - return child_count; + return nodes[0]; } -auto octree::insert(const int node_idx, - const int mass_id, - const Vector3& pos, - const float mass, - const int depth) -> void -{ - if (depth > MAX_DEPTH) { - throw std::runtime_error("Octree insertion reachead max depth"); - } - - // NOTE: Do not store a nodes[node_idx] reference as the nodes vector might reallocate during - // this function - - // We can place the particle in the empty leaf - const bool leaf = nodes[node_idx].leaf; - if (leaf && nodes[node_idx].mass_id == -1) { - nodes[node_idx].mass_id = mass_id; - nodes[node_idx].mass_center = pos; - nodes[node_idx].mass_total = mass; - return; - } - - const Vector3& box_min = nodes[node_idx].box_min; - const Vector3& box_max = nodes[node_idx].box_max; - - // The leaf is occupied, we need to subdivide - if (leaf) { - const int existing_id = nodes[node_idx].mass_id; - const Vector3 existing_pos = nodes[node_idx].mass_center; - const float existing_mass = nodes[node_idx].mass_total; - - // If positions are identical we jitter the particles - const Vector3 diff = Vector3Subtract(pos, existing_pos); - if (diff == Vector3Zero()) { - // warnln("Trying to insert an identical partical into octree (jittering position)"); - - Vector3 jittered = pos; - jittered.x += 0.001; - jittered.y += 0.001; - insert(node_idx, mass_id, jittered, mass, depth); - return; - - // Could also merge them, but that leads to the octree having less leafs than we have - // masses nodes[node_idx].mass_total += mass; return; - } - - // Convert the leaf to an internal node - nodes[node_idx].mass_id = -1; - nodes[node_idx].leaf = false; - nodes[node_idx].mass_center = Vector3Zero(); - nodes[node_idx].mass_total = 0.0; - - // Re-insert the existing mass into a new empty leaf (see above) - const int oct = get_octant(box_min, box_max, existing_pos); - if (nodes[node_idx].children[oct] == -1) { - const auto& [min, max] = get_child_bounds(box_min, box_max, oct); - const int child_idx = create_empty_leaf(min, max); - nodes[node_idx].children[oct] = child_idx; - } - insert(nodes[node_idx].children[oct], existing_id, existing_pos, existing_mass, depth + 1); - } - - // Insert the new mass - const int oct = get_octant(box_min, box_max, pos); - if (nodes[node_idx].children[oct] == -1) { - const auto& [min, max] = get_child_bounds(box_min, box_max, oct); - const int child_idx = create_empty_leaf(min, max); - nodes[node_idx].children[oct] = child_idx; - } - insert(nodes[node_idx].children[oct], mass_id, pos, mass, depth + 1); - - // Update the center of mass - const float new_mass = nodes[node_idx].mass_total + mass; - nodes[node_idx].mass_center = (nodes[node_idx].mass_center * nodes[node_idx].mass_total + pos) / new_mass; - nodes[node_idx].mass_total = new_mass; -} - -auto octree::build_octree(octree& t, const std::vector& positions) -> void +// Replaced the 50 line recursive octree insertion with this bitch to gain 5 UPS, FML +auto octree::build_octree_morton(octree& t, const std::vector& positions) -> void { #ifdef TRACY ZoneScoped; #endif t.clear(); - t.reserve(positions.size() * 2); + if (positions.empty()) { + return; + } // Compute bounding box around all masses - Vector3 min{FLT_MAX, FLT_MAX, FLT_MAX}; - Vector3 max{-FLT_MAX, -FLT_MAX, -FLT_MAX}; + Vector3 root_min{FLT_MAX, FLT_MAX, FLT_MAX}; + Vector3 root_max{-FLT_MAX, -FLT_MAX, -FLT_MAX}; for (const auto& [x, y, z] : positions) { - min.x = std::min(min.x, x); - max.x = std::max(max.x, x); - min.y = std::min(min.y, y); - max.y = std::max(max.y, y); - min.z = std::min(min.z, z); - max.z = std::max(max.z, z); + root_min.x = std::min(root_min.x, x); + root_max.x = std::max(root_max.x, x); + root_min.y = std::min(root_min.y, y); + root_max.y = std::max(root_max.y, y); + root_min.z = std::min(root_min.z, z); + root_max.z = std::max(root_max.z, z); } - // Pad the bounding box - constexpr float pad = 1.0; - min = Vector3Subtract(min, Vector3Scale(Vector3One(), pad)); - max = Vector3Add(max, Vector3Scale(Vector3One(), pad)); + constexpr float pad = 1.0f; + root_min = Vector3Subtract(root_min, Vector3Scale(Vector3One(), pad)); + root_max = Vector3Add(root_max, Vector3Scale(Vector3One(), pad)); - // Make it cubic (so subdivisions are balanced) - const float max_extent = std::max({max.x - min.x, max.y - min.y, max.z - min.z}); - max = Vector3Add(min, Vector3Scale(Vector3One(), max_extent)); + const float max_extent = std::max({root_max.x - root_min.x, root_max.y - root_min.y, root_max.z - root_min.z}); + root_max = Vector3Add(root_min, Vector3Scale(Vector3One(), max_extent)); - // Root node spans the entire area - const int root = t.create_empty_leaf(min, max); + const float root_extent = root_max.x - root_min.x; // cubic - for (size_t i = 0; i < positions.size(); ++i) { - t.insert(root, static_cast(i), positions[i], MASS, 0); + // Container for building the particle list before sorting by morton code + struct sort_node + { + uint64_t code; + uint32_t id; + Vector3 pos; + }; + + std::vector sort_container; + sort_container.reserve(positions.size()); + for (uint32_t i = 0; i < positions.size(); ++i) { + sort_container.emplace_back(pos_to_morton(positions[i], root_min, root_max), i, positions[i]); } + + // Sort the list by morton codes. Because positions close to each other have similar morten codes, + // this provides us with "spatial locality" in the datastructure. + auto sort_by_code = [&]() + { + std::ranges::sort(sort_container, + [](const sort_node& a, const sort_node& b) + { + if (a.code != b.code) { + return a.code < b.code; + } + return a.id < b.id; + }); + }; + + sort_by_code(); + + // Resolve duplicates by jittering the later one deterministically and re-encoding. + for (int seed = 0; seed < 8; ++seed) { + bool had_dupes = false; + + for (size_t i = 1; i < sort_container.size(); ++i) { + // Because elements are spatially ordered after sorting, we can scan for duplicates in O(n) + if (sort_container[i].code == sort_container[i - 1].code) { + had_dupes = true; + sort_container[i].pos = jitter_pos(sort_container[i].pos, + sort_container[i].id + seed * 0x9e3779b9u, + root_min, + root_max, + root_extent); + sort_container[i].code = pos_to_morton(sort_container[i].pos, root_min, root_max); + } + } + + if (!had_dupes) { + break; + } + sort_by_code(); + } + + // Sanity check + for (size_t i = 1; i < sort_container.size(); ++i) { + if (sort_container[i].code == sort_container[i - 1].code) { + throw std::runtime_error("Duplicates remain after jittering"); + } + } + + std::vector> tree_levels; + tree_levels.assign(MAX_DEPTH + 1, {}); + + // Leaves at MAX_DEPTH: 1 particle per leaf in morton order (close particles close together) + auto& leafs = tree_levels[MAX_DEPTH]; + leafs.reserve(sort_container.size()); + const float leaf_size = root_extent / static_cast(MAX_DEPTH); + for (const auto& [code, id, pos] : sort_container) { + node leaf; + leaf.leaf = true; + leaf.mass_id = static_cast(id); + leaf.depth = MAX_DEPTH; + leaf.size = leaf_size; + leaf.mass_total = MASS; + leaf.mass_center = pos; // force uses mass_center instead of jittered position + leaf.children.fill(-1); + leafs.push_back(leaf); + } + + // We now have to group the particles (currently we have only sorted the leaves), + // but upwards subdivisions have to be created. + // For grouping, store a nodes local index in its level. + struct leaf + { + uint64_t leaf_code; + int depth; + int level_index; + }; + + std::vector leaves; + leaves.reserve(tree_levels[MAX_DEPTH].size()); + for (int i = 0; i < static_cast(tree_levels[MAX_DEPTH].size()); ++i) { + leaves.emplace_back(sort_container[static_cast(i)].code, MAX_DEPTH, i); + } + + // Build internal levels from MAX_DEPTH-1 to 0 + for (int current_depth = MAX_DEPTH - 1; current_depth >= 0; --current_depth) { + auto& current_level = tree_levels[current_depth]; + current_level.clear(); + + std::vector next_refs; + next_refs.reserve(leaves.size()); + + const float parent_size = root_extent / static_cast(1u << current_depth); + + size_t i = 0; + while (i < leaves.size()) { + const uint64_t key = path_to_ancestor(leaves[i].leaf_code, MAX_DEPTH, current_depth); + + size_t j = i + 1; + while (j < leaves.size() && path_to_ancestor(leaves[j].leaf_code, MAX_DEPTH, current_depth) == key) { + ++j; + } + + const size_t group_size = j - i; + + // Unary compression: just carry the child ref upward unchanged. + if (group_size == 1) { + next_refs.push_back(leaves[i]); + i = j; + continue; + } + + node parent; + parent.leaf = false; + parent.mass_id = -1; + parent.depth = current_depth; + parent.size = parent_size; + parent.children.fill(-1); + + float mass_total = 0.0f; + Vector3 mass_center_acc = Vector3Zero(); + + for (size_t k = i; k < j; ++k) { + const int child_depth = leaves[k].depth; + const int child_local = leaves[k].level_index; + + // Read the child from its actual stored level. + const node& child = tree_levels[child_depth][child_local]; + + // Which octant of this parent does it belong to? + // IMPORTANT: octant comes from the NEXT level after current_depth (i.e. current_depth+1), + // but the child might skip levels due to compression. + // We must use the child's "first level under the parent" which is (current_depth+1). + const int oct = octant_at_level(leaves[k].leaf_code, current_depth + 1, MAX_DEPTH); + + // Store *global* child reference: we only have an int slot, so we need a single index space. + parent.children[oct] = (child_depth << 24) | (child_local & 0x00FFFFFF); + + mass_total += child.mass_total; + mass_center_acc = Vector3Add(mass_center_acc, Vector3Scale(child.mass_center, child.mass_total)); + } + + parent.mass_total = mass_total; + parent.mass_center = (mass_total > 0.0f) ? Vector3Scale(mass_center_acc, 1.0f / mass_total) : Vector3Zero(); + + const int parent_local = static_cast(current_level.size()); + current_level.push_back(parent); + + next_refs.push_back({leaves[i].leaf_code, current_depth, parent_local}); + i = j; + } + + leaves.swap(next_refs); + } + + // Flatten levels and fix child indices from local->global. + // All depth 0 nodes come first, then depth 1, last depth MAX_DEPTH. + std::vector offsets(tree_levels.size(), 0); + int total = 0; + for (int d = 0; d <= MAX_DEPTH; ++d) { + offsets[d] = total; + total += static_cast(tree_levels[d].size()); + } + + t.nodes.clear(); + t.nodes.reserve(total); + for (int d = 0; d <= MAX_DEPTH; ++d) { + t.nodes.insert(t.nodes.end(), tree_levels[d].begin(), tree_levels[d].end()); + } + + // Fix child indices: convert local index in levels[d+1] to global index in t.nodes + for (int d = 0; d <= MAX_DEPTH; ++d) { + const int base = offsets[d]; + for (int i2 = 0; i2 < static_cast(tree_levels[d].size()); ++i2) { + node& n = t.nodes[base + i2]; + if (!n.leaf) { + for (int c = 0; c < 8; ++c) { + int packed = n.children[c]; + if (packed >= 0) { + const int child_depth = (packed >> 24) & 0xFF; + const int child_local = packed & 0x00FFFFFF; + n.children[c] = offsets[child_depth] + child_local; + } + } + } + } + } + + // const size_t _leaves = tree_levels[MAX_DEPTH].size(); + // const size_t _total = t.nodes.size(); + // const size_t _internal = _total - _leaves; + // traceln("Morton octree nodes: {}, leaves: {}, ratio: {:.3} children per internal node.", + // _total, + // _leaves, + // static_cast(_total - 1) / _internal); } -auto octree::calculate_force(const int node_idx, const Vector3& pos) const -> Vector3 +auto octree::calculate_force_morton(const int node_idx, const Vector3& pos, const int self_id) const -> Vector3 { if (node_idx < 0) { return Vector3Zero(); } - const node& n = nodes[node_idx]; + // Force accumulator + float fx = 0.0f; + float fy = 0.0f; + float fz = 0.0f; - const float mass_total = n.mass_total; - const Vector3& mass_center = n.mass_center; - const Vector3& box_min = n.box_min; - const Vector3& box_max = n.box_max; - const std::array& children = n.children; - const bool leaf = n.leaf; + std::vector stack; + stack.reserve(128); + stack.push_back(node_idx); - if (std::abs(mass_total) <= 0.001f) { - return Vector3Zero(); - } + constexpr float theta2 = THETA * THETA; - const Vector3 diff = Vector3Subtract(pos, mass_center); - float dist_sq = diff.x * diff.x + diff.y * diff.y + diff.z * diff.z; + while (!stack.empty()) { + const int idx = stack.back(); + stack.pop_back(); - // Softening - dist_sq += SOFTENING; + const node& n = nodes[idx]; - // Barnes-Hut - const float size = box_max.x - box_min.x; - if (leaf || size * size / dist_sq < THETA * THETA) { - const float dist = std::sqrt(dist_sq); - const float force_mag = BH_FORCE * mass_total / dist_sq; + // No self-force for single-particle leafs + if (n.leaf && n.mass_id == self_id) { + continue; + } - return Vector3Scale(diff, force_mag / dist); - } + const float dx = pos.x - n.mass_center.x; + const float dy = pos.y - n.mass_center.y; + const float dz = pos.z - n.mass_center.z; - // Collect child forces - Vector3 force = Vector3Zero(); - for (const int child : children) { - if (child >= 0) { - const Vector3 child_force = calculate_force(child, pos); + const float dist_sq = dx * dx + dy * dy + dz * dz + SOFTENING; - force = Vector3Add(force, child_force); + // Barnes–Hut criterion + if (n.leaf || ((n.size * n.size) / dist_sq) < theta2) { + const float inv_dist = 1.0f / std::sqrt(dist_sq); + const float force_mag = (BH_FORCE * n.mass_total) / dist_sq; // ~ 1/r^2 + const float s = force_mag * inv_dist; // scale by 1/r to get vector + fx += dx * s; + fy += dy * s; + fz += dz * s; + continue; + } + + for (int c = 0; c < 8; ++c) { + const int child = n.children[c]; + if (child >= 0) { + stack.push_back(child); + } } } - return force; + return Vector3{fx, fy, fz}; } \ No newline at end of file