diff --git a/.clangd b/.clangd index 72fbebe..602b81e 120000 --- a/.clangd +++ b/.clangd @@ -1 +1 @@ -./cmake-build-debug/.clangd \ No newline at end of file +./cmake-build-release/.clangd \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 272e3b3..2aad6d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ project(ObjRender) set(CMAKE_CXX_STANDARD 23) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=znver4 -mavx512f -mavx512dq -mavx512vl -ffast-math") # -ffast-math -flto find_package(raylib REQUIRED) diff --git a/compile_commands.json b/compile_commands.json index 66636ac..fd9db9d 120000 --- a/compile_commands.json +++ b/compile_commands.json @@ -1 +1 @@ -./cmake-build-debug/compile_commands.json \ No newline at end of file +./cmake-build-release/compile_commands.json \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index ed896c2..2ae4945 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,25 +1,49 @@ #define TINYOBJLOADER_IMPLEMENTATION +#define TIME // Print transformation times +// #define MATRIX_TRANSFORM // Use combined matrix transform +// #define PARALLEL_TRANSFORM +#define AVX_TRANSFORM +#define PRUNE // Prune duplicate edges +#define PRUNE_HASH // Do hash-based pruning + +#ifdef PRUNE #include +#ifdef PRUNE_HASH +#include +#endif // #ifdef PRUNE_HASH +#endif // #ifdef PRUNE + +#ifdef TIME #include +#endif + +#ifndef MATRIX_TRANSFORM #include +#ifdef PARALLEL_TRANSFORM +#include +#elifdef AVX_TRANSFORM +#include +#endif // #ifdef PARALLEL_TRANSFORM +#endif // #ifdef MATRIX_TRANSFORM + #include -#include #include +#include #include #include constexpr int WIDTH = 800; constexpr int HEIGHT = 800; +constexpr float VERTEX_SIZE = 1.5; +constexpr Color VERTEX_COLOR = {27, 188, 104, 255}; +constexpr Color EDGE_COLOR = {20, 133, 38, 255}; constexpr float SPEED = 1.0; constexpr float CAMERA_DISTANCE = 2.2; using Edge2Set = std::vector>; using Edge3Set = std::vector>; -// constexpr Color EDGE_COLOR = {27, 188, 104, 255}; -constexpr Color EDGE_COLOR = {20, 133, 38, 255}; - auto parse_obj_file(Edge3Set &result, const std::string_view path) -> void { tinyobj::attrib_t attrib; std::vector shapes; @@ -80,6 +104,43 @@ auto parse_obj_file(Edge3Set &result, const std::string_view path) -> void { << std::endl; } +#ifdef PRUNE +#ifdef PRUNE_HASH +struct Vector3Hash { + size_t operator()(const Vector3 &v) const { + return std::hash()(v.x) ^ (std::hash()(v.y) << 1) ^ + (std::hash()(v.z) << 2); + } +}; + +struct EdgeHash { + size_t operator()(const std::pair &e) const { + Vector3Hash h; + return h(e.first) ^ (h(e.second) << 1); + } +}; + +auto prune_edges(Edge3Set &result, const Edge3Set &edges) -> void { + std::unordered_set, EdgeHash> seen; + + for (const auto &edge : edges) { + auto normalized = + (edge.first.x < edge.second.x || + (edge.first.x == edge.second.x && edge.first.y < edge.second.y) || + (edge.first.x == edge.second.x && edge.first.y == edge.second.y && + edge.first.z < edge.second.z)) + ? edge + : std::make_pair(edge.second, edge.first); + + if (seen.insert(normalized).second) { + result.emplace_back(edge); + } + } + + std::cout << "Found " << edges.size() - result.size() << " duplicate edges." + << std::endl; +} +#else auto prune_edges(Edge3Set &result, const Edge3Set &edges) -> void { auto eq = [](const float a, const float b) -> bool { return fabs(a - b) <= 0.001; @@ -106,45 +167,205 @@ auto prune_edges(Edge3Set &result, const Edge3Set &edges) -> void { std::cout << "Found " << edges.size() - result.size() << " duplicate edges." << std::endl; } +#endif // #ifdef PRUNE_HASH +#endif // #ifdef PRUNE -auto to_viewport(Edge2Set &result, const Edge2Set &edges) -> void { +#ifdef MATRIX_TRANSFORM +auto matrix_transform(Edge2Set &result, const Edge3Set &edges, + const Camera &camera, const Matrix &model_transformation) + -> void { for (const auto &[a, b] : edges) { - result.emplace_back( - Vector2((a.x + 1.0) / 2.0 * WIDTH, - (1.0 - (a.y + 1.0)) / 2.0 * HEIGHT + HEIGHT / 2.0), - Vector2((b.x + 1.0) / 2.0 * WIDTH, - (1.0 - (b.y + 1.0)) / 2.0 * HEIGHT + HEIGHT / 2.0)); + const Vector3 modelA = Vector3Transform(a, model_transformation); + const Vector3 modelB = Vector3Transform(b, model_transformation); + const Vector2 screenA = GetWorldToScreen(modelA, camera); + const Vector2 screenB = GetWorldToScreen(modelB, camera); + result.emplace_back(screenA, screenB); } } +#else +auto manual_transform(Edge2Set &result, const Edge3Set &edges, + const float angle, const float distance) -> void { + const float cos_angle = cos(angle); + const float sin_angle = sin(angle); -auto to_imageplane(Edge2Set &result, const Edge3Set &edges) -> void { - for (const auto &[a, b] : edges) { - result.emplace_back(Vector2(a.x / a.z, a.y / a.z), - Vector2(b.x / b.z, b.y / b.z)); + auto rotate = [&](const Vector3 &a) -> Vector3 { + return Vector3(a.x * cos_angle - a.z * sin_angle, a.y, + a.x * sin_angle + a.z * cos_angle); + }; + + auto translate = [&](const Vector3 &a) -> Vector3 { + return Vector3(a.x, a.y, a.z + distance); + }; + + auto project = [&](const Vector3 &a) -> Vector2 { + return Vector2(a.x / a.z, a.y / a.z); + }; + + auto map = [&](const Vector2 &a) -> Vector2 { + return Vector2((a.x + 1.0) / 2.0 * WIDTH, + (1.0 - (a.y + 1.0)) / 2.0 * HEIGHT + HEIGHT / 2.0); + }; + +#ifdef PARALLEL_TRANSFORM + result.resize(edges.size()); + std::transform( + std::execution::par_unseq, edges.begin(), edges.end(), result.begin(), + [&](const auto &edge) -> std::pair { + const Vector2 at = map(project(translate(rotate(edge.first)))); + const Vector2 bt = map(project(translate(rotate(edge.second)))); + + return std::make_pair(at, bt); + }); +#elifdef AVX_TRANSFORM + result.resize(edges.size()); + + // Broadcast constants to all 16 lanes + const __m512 cos_a = _mm512_set1_ps(cos(angle)); + const __m512 sin_a = _mm512_set1_ps(sin(angle)); + const __m512 dist = _mm512_set1_ps(distance); + const __m512 half_width = _mm512_set1_ps(WIDTH * 0.5f); + const __m512 half_height = _mm512_set1_ps(HEIGHT * 0.5f); + const __m512 one = _mm512_set1_ps(1.0f); + + size_t i = 0; + + // Process 8 edges at a time (16 points total) + for (; i + 7 < edges.size(); i += 8) { + // Load 8 edge start points (interleaved) + __m512 ax, ay, az, bx, by, bz; + + // Gather x coordinates for 8 start points + ax = _mm512_set_ps( + edges[i + 7].first.x, edges[i + 6].first.x, edges[i + 5].first.x, + edges[i + 4].first.x, edges[i + 3].first.x, edges[i + 2].first.x, + edges[i + 1].first.x, edges[i].first.x, edges[i + 7].first.x, + edges[i + 6].first.x, edges[i + 5].first.x, edges[i + 4].first.x, + edges[i + 3].first.x, edges[i + 2].first.x, edges[i + 1].first.x, + edges[i].first.x); + + ay = _mm512_set_ps( + edges[i + 7].first.y, edges[i + 6].first.y, edges[i + 5].first.y, + edges[i + 4].first.y, edges[i + 3].first.y, edges[i + 2].first.y, + edges[i + 1].first.y, edges[i].first.y, edges[i + 7].first.y, + edges[i + 6].first.y, edges[i + 5].first.y, edges[i + 4].first.y, + edges[i + 3].first.y, edges[i + 2].first.y, edges[i + 1].first.y, + edges[i].first.y); + + az = _mm512_set_ps( + edges[i + 7].first.z, edges[i + 6].first.z, edges[i + 5].first.z, + edges[i + 4].first.z, edges[i + 3].first.z, edges[i + 2].first.z, + edges[i + 1].first.z, edges[i].first.z, edges[i + 7].first.z, + edges[i + 6].first.z, edges[i + 5].first.z, edges[i + 4].first.z, + edges[i + 3].first.z, edges[i + 2].first.z, edges[i + 1].first.z, + edges[i].first.z); + + // Gather x,y,z for 8 end points + bx = _mm512_set_ps( + edges[i + 7].second.x, edges[i + 6].second.x, edges[i + 5].second.x, + edges[i + 4].second.x, edges[i + 3].second.x, edges[i + 2].second.x, + edges[i + 1].second.x, edges[i].second.x, edges[i + 7].second.x, + edges[i + 6].second.x, edges[i + 5].second.x, edges[i + 4].second.x, + edges[i + 3].second.x, edges[i + 2].second.x, edges[i + 1].second.x, + edges[i].second.x); + + by = _mm512_set_ps( + edges[i + 7].second.y, edges[i + 6].second.y, edges[i + 5].second.y, + edges[i + 4].second.y, edges[i + 3].second.y, edges[i + 2].second.y, + edges[i + 1].second.y, edges[i].second.y, edges[i + 7].second.y, + edges[i + 6].second.y, edges[i + 5].second.y, edges[i + 4].second.y, + edges[i + 3].second.y, edges[i + 2].second.y, edges[i + 1].second.y, + edges[i].second.y); + + bz = _mm512_set_ps( + edges[i + 7].second.z, edges[i + 6].second.z, edges[i + 5].second.z, + edges[i + 4].second.z, edges[i + 3].second.z, edges[i + 2].second.z, + edges[i + 1].second.z, edges[i].second.z, edges[i + 7].second.z, + edges[i + 6].second.z, edges[i + 5].second.z, edges[i + 4].second.z, + edges[i + 3].second.z, edges[i + 2].second.z, edges[i + 1].second.z, + edges[i].second.z); + + // Rotate: x' = x*cos - z*sin, z' = x*sin + z*cos + __m512 ax_rot = _mm512_fmsub_ps(ax, cos_a, _mm512_mul_ps(az, sin_a)); + __m512 az_rot = _mm512_fmadd_ps(ax, sin_a, _mm512_mul_ps(az, cos_a)); + __m512 bx_rot = _mm512_fmsub_ps(bx, cos_a, _mm512_mul_ps(bz, sin_a)); + __m512 bz_rot = _mm512_fmadd_ps(bx, sin_a, _mm512_mul_ps(bz, cos_a)); + + // Translate z + az_rot = _mm512_add_ps(az_rot, dist); + bz_rot = _mm512_add_ps(bz_rot, dist); + + // Project: x/z, y/z + __m512 ax_proj = _mm512_div_ps(ax_rot, az_rot); + __m512 ay_proj = _mm512_div_ps(ay, az_rot); + __m512 bx_proj = _mm512_div_ps(bx_rot, bz_rot); + __m512 by_proj = _mm512_div_ps(by, bz_rot); + + // Map to screen: (proj + 1) * width/2 + __m512 ax_screen = _mm512_mul_ps(_mm512_add_ps(ax_proj, one), half_width); + __m512 ay_screen = + _mm512_fmadd_ps(_mm512_sub_ps(one, _mm512_add_ps(ay_proj, one)), + half_height, half_height); + __m512 bx_screen = _mm512_mul_ps(_mm512_add_ps(bx_proj, one), half_width); + __m512 by_screen = + _mm512_fmadd_ps(_mm512_sub_ps(one, _mm512_add_ps(by_proj, one)), + half_height, half_height); + + // Store results + alignas(64) float ax_out[16], ay_out[16], bx_out[16], by_out[16]; + _mm512_store_ps(ax_out, ax_screen); + _mm512_store_ps(ay_out, ay_screen); + _mm512_store_ps(bx_out, bx_screen); + _mm512_store_ps(by_out, by_screen); + + // Extract to result vector + for (size_t j = 0; j < 8; ++j) { + result[i + j] = {{ax_out[j], ay_out[j]}, {bx_out[j], by_out[j]}}; + } } -} -auto translate_forward(Edge3Set &result, const Edge3Set &edges, - const float distance) -> void { - for (const auto &[a, b] : edges) { - result.emplace_back(Vector3(a.x, a.y, a.z + distance), - Vector3(b.x, b.y, b.z + distance)); + // Handle remaining edges with scalar code + for (; i < edges.size(); ++i) { + const auto &[a, b] = edges[i]; + + auto rotate = [angle](const Vector3 &v) -> Vector3 { + return Vector3(v.x * cos(angle) - v.z * sin(angle), v.y, + v.x * sin(angle) + v.z * cos(angle)); + }; + + auto translate = [distance](const Vector3 &v) -> Vector3 { + return Vector3(v.x, v.y, v.z + distance); + }; + + auto project = [](const Vector3 &v) -> Vector2 { + return Vector2(v.x / v.z, v.y / v.z); + }; + + auto map = [](const Vector2 &v) -> Vector2 { + return Vector2((v.x + 1) * WIDTH / 2, + (1 - (v.y + 1)) * HEIGHT / 2.0 + HEIGHT / 2.0); + }; + + auto at = map(project(translate(rotate(a)))); + auto bt = map(project(translate(rotate(b)))); + + result[i] = {at, bt}; } -} - -auto rotate_upwards(Edge3Set &result, const Edge3Set &edges, - const float abstime) -> void { +#else for (const auto &[a, b] : edges) { - result.emplace_back(Vector3(a.x * cos(abstime) - a.z * sin(abstime), a.y, - a.x * sin(abstime) + a.z * cos(abstime)), - Vector3(b.x * cos(abstime) - b.z * sin(abstime), b.y, - b.x * sin(abstime) + b.z * cos(abstime))); + const Vector2 at = map(project(translate(rotate(a)))); + const Vector2 bt = map(project(translate(rotate(b)))); + result.emplace_back(at, bt); } +#endif } +#endif -auto draw(const Edge2Set &edges) -> void { +auto draw_edges(const Edge2Set &edges) -> void { for (const auto &[a, b] : edges) { DrawLine(a.x, a.y, b.x, b.y, EDGE_COLOR); + // DrawCircle(a.x, a.y, VERTEX_SIZE, VERTEX_COLOR); + // std::cout << "Drawing (" << a.x << ", " << a.y << ") -> (" << b.x << ", " + // << b.y << ")" << std::endl; } } @@ -154,64 +375,119 @@ auto main(int argc, char *argv[]) -> int { return 1; } - // SetTargetFPS(60); - SetConfigFlags(FLAG_VSYNC_HINT); - // SetConfigFlags(FLAG_MSAA_4X_HINT); + SetTraceLogLevel(LOG_ERROR); - raylib::Window window(WIDTH, HEIGHT, "ObjRender"); + // SetTargetFPS(60); + // SetConfigFlags(FLAG_VSYNC_HINT); + SetConfigFlags(FLAG_MSAA_4X_HINT); + + InitWindow(WIDTH, HEIGHT, "ObjRender"); Edge3Set edges; parse_obj_file(edges, argv[1]); +#ifdef PRUNE Edge3Set pruned; + +#ifdef TIME + std::chrono::high_resolution_clock::time_point start_prune = + std::chrono::high_resolution_clock::now(); +#endif // #ifdef TIME prune_edges(pruned, edges); +#ifdef TIME + std::chrono::high_resolution_clock::time_point end_prune = + std::chrono::high_resolution_clock::now(); + std::chrono::duration prune_time = + end_prune - start_prune; + std::cout << "Edge pruning took " << prune_time << "." << std::endl; +#endif // #ifdef TIME + +#endif // #ifdef PRUNE + +#ifdef MATRIX_TRANSFORM + Camera3D camera = Camera3D(Vector3(0.0, 0.0, -1.0 * CAMERA_DISTANCE), + Vector3(0.0, 0.0, 1.0), Vector3(0.0, 1.0, 0.0), + 90.0, CAMERA_PERSPECTIVE); + + Matrix translation = MatrixTranslate(0.0, 0.0, CAMERA_DISTANCE); +#endif - // TODO: Replace this with combined matrix transform - Edge3Set rotated; - Edge3Set translated; - Edge2Set projected; Edge2Set viewport; - rotated.reserve(pruned.size()); - translated.reserve(pruned.size()); - projected.reserve(pruned.size()); +#ifdef PRUNE viewport.reserve(pruned.size()); +#else + viewport.reserve(edges.size()); +#endif - double last_print = window.GetTime(); - int measure_count = 0; +#ifdef TIME + double last_print_time = GetTime(); + std::chrono::duration time_accumulator = + std::chrono::duration(0); + int time_measure_count = 0; +#endif + + RenderTexture2D render_target; + render_target = LoadRenderTexture(WIDTH, HEIGHT); float abstime = 0.0; - while (!window.ShouldClose()) { - double time = window.GetTime(); + while (!WindowShouldClose()) { - std::chrono::high_resolution_clock::time_point start = +#ifdef TIME + double time = GetTime(); + std::chrono::high_resolution_clock::time_point start_transform = std::chrono::high_resolution_clock::now(); +#endif + +#ifdef MATRIX_TRANSFORM + Matrix rotation = MatrixRotateY(abstime); - rotated.clear(); - translated.clear(); - projected.clear(); viewport.clear(); - rotate_upwards(rotated, pruned, abstime); - translate_forward(translated, rotated, CAMERA_DISTANCE); - to_imageplane(projected, translated); - to_viewport(viewport, projected); +#ifdef PRUNE + matrix_transform(viewport, pruned, camera, rotation); +#else + matrix_transform(viewport, edges, camera, rotation); +#endif - // TODO: Calculate second average - if (time - last_print > 1.0) { - std::chrono::high_resolution_clock::time_point end = - std::chrono::high_resolution_clock::now(); - std::chrono::duration ms_double = end - start; - std::cout << "Transformation took " << ms_double << "." << std::endl; - last_print = time; +#else + viewport.clear(); +#ifdef PRUNE + manual_transform(viewport, pruned, abstime, CAMERA_DISTANCE); +#else + manual_transform(viewport, edges, abstime, CAMERA_DISTANCE); +#endif // #ifdef PRUNE + +#endif // #ifdef MATRIX_TRANSFORM + +#ifdef TIME + std::chrono::high_resolution_clock::time_point end_transform = + std::chrono::high_resolution_clock::now(); + time_accumulator += end_transform - start_transform; + time_measure_count++; + if (time - last_print_time > 5.0) { + std::cout << "Transformation time avg: " + << time_accumulator / time_measure_count << "." << std::endl; + last_print_time = time; + time_accumulator = std::chrono::duration(0); + time_measure_count = 0; } +#endif - window.ClearBackground(RAYWHITE); + BeginTextureMode(render_target); + ClearBackground(RAYWHITE); + draw_edges(viewport); + EndTextureMode(); - window.BeginDrawing(); - draw(viewport); - window.EndDrawing(); + BeginDrawing(); + DrawTextureRec(render_target.texture, + Rectangle(0, 0, (float)WIDTH, -(float)HEIGHT), Vector2(0, 0), + WHITE); + DrawFPS(10, 10); + EndDrawing(); - abstime += window.GetFrameTime() * SPEED; + abstime += GetFrameTime() * SPEED; } + UnloadRenderTexture(render_target); + return 0; }