From 35c2253a5fe9a855fb1496d5a612f84b2af1d13a Mon Sep 17 00:00:00 2001
From: Ishwar Bhati
Date: Wed, 25 Mar 2026 10:20:53 -0700
Subject: [PATCH] Add MMapAllocator for SSD-backed memory-mapped data allocation

Add MMapAllocator that allocates memory via mmap-backed files on a
configurable path (e.g. NVMe/SSD). This enables placing secondary data
structures on SSD while keeping primary data in RAM, reducing heap
memory usage for large-scale vector search.

- include/svs/core/allocator_mmap.h: MMapAllocator with configurable
  base path, access hints (Sequential/Random/Normal), and automatic
  unmapping on deallocation.
- tests/svs/core/allocator_mmap.cpp: Unit tests for allocation,
  deallocation, allocator rebinding, equality, and SimpleData
  integration.
- tests/CMakeLists.txt: Register mmap allocator test.
---
 include/svs/core/allocator_mmap.h | 276 ++++++++++++++++++++++++++++++
 tests/CMakeLists.txt              |   1 +
 tests/svs/core/allocator_mmap.cpp | 213 +++++++++++++++++++++++
 3 files changed, 490 insertions(+)
 create mode 100644 include/svs/core/allocator_mmap.h
 create mode 100644 tests/svs/core/allocator_mmap.cpp

diff --git a/include/svs/core/allocator_mmap.h b/include/svs/core/allocator_mmap.h
new file mode 100644
index 000000000..a29aadb0f
--- /dev/null
+++ b/include/svs/core/allocator_mmap.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2026 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "svs/core/allocator.h"
+#include "svs/lib/exception.h"
+#include "svs/lib/memory.h"
+
+#include "fmt/core.h"
+#include "tsl/robin_map.h"
+
+// NOTE(review): the original standard-library include targets were lost when this
+// patch was extracted. The list below is rebuilt from the standard names this header
+// uses -- confirm against the upstream file.
+#include <cstddef>
+#include <filesystem>
+#include <mutex>
+#include <system_error>
+#include <thread>
+#include <type_traits>
+#include <utility>
+
+#ifdef __linux__
+#include <sys/mman.h>
+#endif
+
+namespace svs {
+
+namespace detail {
+
+///
+/// @brief Process-wide registry of file-backed memory-mapped allocations.
+///
+/// Every live allocation is recorded in a static table keyed by the mapped address.
+/// Each entry owns the mapping object returned by ``MemoryMapper::mmap`` (so the
+/// mapping stays alive until ``deallocate`` erases the entry) together with the path of
+/// the backing file, so that the file can be deleted once the mapping is released.
+///
+/// All accesses to the table are serialized by a static mutex.
+///
+class MMapAllocationManager {
+  public:
+    MMapAllocationManager() = default;
+
+    ///
+    /// @brief Allocate memory mapped to a file
+    ///
+    /// Maps ``file_path`` read-write (creating the file if necessary) and registers
+    /// the mapping so it can later be released through ``deallocate``.
+    ///
+    /// The function is static because all state lives in static members; calling it
+    /// through a temporary instance (``MMapAllocationManager{}.allocate(...)``) remains
+    /// valid.
+    ///
+    /// @param bytes Number of bytes to allocate
+    /// @param file_path Path to the file for backing storage
+    /// @return Pointer to the allocated memory
+    ///
+    [[nodiscard]] static void*
+    allocate(size_t bytes, const std::filesystem::path& file_path) {
+        MemoryMapper mapper{MemoryMapper::ReadWrite, MemoryMapper::MayCreate};
+        auto mapping = mapper.mmap(file_path, lib::Bytes(bytes));
+
+        // Read the address before the mapping object is moved into the table.
+        void* ptr = mapping.data();
+
+        // Store the mapping (and its file) to keep it alive until deallocation.
+        std::lock_guard<std::mutex> lock{mutex_};
+        allocations_.emplace(ptr, Record{std::move(mapping), file_path});
+        return ptr;
+    }
+
+    ///
+    /// @brief Release a memory-mapped allocation and delete its backing file
+    ///
+    /// Erasing the table entry destroys the stored mapping object (the original
+    /// comments state that its destructor calls ``munmap``). Afterwards the backing
+    /// file is removed; a failure to remove it is ignored because the memory itself
+    /// has already been released.
+    ///
+    /// @param ptr Pointer previously returned by ``allocate``
+    /// @throws ANNException if ``ptr`` is not a registered allocation
+    ///
+    static void deallocate(void* ptr) {
+        std::filesystem::path backing_file;
+        {
+            std::lock_guard<std::mutex> lock{mutex_};
+            auto itr = allocations_.find(ptr);
+            if (itr == allocations_.end()) {
+                throw ANNEXCEPTION("Could not find memory-mapped allocation to deallocate!");
+            }
+
+            // Copy the path out first: erasing invalidates the entry.
+            backing_file = itr->second.path;
+            allocations_.erase(itr);
+        }
+
+        // File-system work happens outside the lock. Use the non-throwing overload so
+        // that deallocation never fails just because the file is already gone.
+        std::error_code ignored;
+        std::filesystem::remove(backing_file, ignored);
+    }
+
+    ///
+    /// @brief Get count of current allocations (for debugging/testing)
+    ///
+    static size_t allocation_count() {
+        std::lock_guard<std::mutex> lock{mutex_};
+        return allocations_.size();
+    }
+
+  private:
+    /// A live mapping together with the file that backs it.
+    struct Record {
+        // NOTE(review): assumes ``MemoryMapper::mmap`` returns ``MMapPtr<void>``; the
+        // template arguments of the original map type were lost in extraction.
+        MMapPtr<void> mapping;
+        std::filesystem::path path;
+    };
+
+    inline static std::mutex mutex_{};
+    inline static tsl::robin_map<void*, Record> allocations_{};
+};
+
+} // namespace detail
+
+///
+/// @brief Access pattern hint for memory-mapped allocations
+///
+/// On Linux the hint is translated to an ``madvise`` advice value right after a region
+/// has been mapped; on other platforms it is stored but not applied.
+///
+enum class MMapAccessHint {
+    Normal,     ///< Default access pattern
+    Sequential, ///< Data will be accessed sequentially
+    Random      ///< Data will be accessed randomly
+};
+
+namespace detail {
+
+///
+/// @brief Return a process-wide unique sequence number for naming backing files.
+///
+/// The counter is deliberately shared by every ``MMapAllocator`` instantiation and
+/// instance. A per-allocator counter is duplicated whenever an allocator is copied or
+/// rebound, which lets two allocators generate the same file name and silently map
+/// the same file twice.
+///
+inline size_t next_mmap_allocation_id() {
+    static std::mutex guard;
+    static size_t next_id = 0;
+    std::lock_guard<std::mutex> lock{guard};
+    return next_id++;
+}
+
+} // namespace detail
+
+///
+/// @brief File-backed memory-mapped allocator for LeanVec secondary data
+///
+/// This allocator uses memory-mapped files to store data on SSD rather than RAM.
+/// It's particularly useful for the secondary (full-dimension) dataset in LeanVec,
+/// which is accessed less frequently during search.
+///
+/// @tparam T The value type for the allocator
+///
+template <typename T> class MMapAllocator {
+  private:
+    std::filesystem::path base_path_;
+    MMapAccessHint access_hint_ = MMapAccessHint::Normal;
+
+  public:
+    // C++ allocator type aliases
+    using value_type = T;
+    using propagate_on_container_copy_assignment = std::true_type;
+    using propagate_on_container_move_assignment = std::true_type;
+    using propagate_on_container_swap = std::true_type;
+    using is_always_equal =
+        std::false_type; // Allocators with different paths are different
+
+    ///
+    /// @brief Construct a new MMapAllocator
+    ///
+    /// A non-empty ``base_path`` that does not exist yet is created, including any
+    /// missing parent directories.
+    ///
+    /// @param base_path Directory path for storing memory-mapped files.
+    ///                  If empty, files are placed in
+    ///                  ``std::filesystem::temp_directory_path()``.
+    /// @param access_hint Hint about how the data will be accessed
+    ///
+    explicit MMapAllocator(
+        std::filesystem::path base_path = {},
+        MMapAccessHint access_hint = MMapAccessHint::Normal
+    )
+        : base_path_{std::move(base_path)}
+        , access_hint_{access_hint} {
+        if (!base_path_.empty() && !std::filesystem::exists(base_path_)) {
+            std::filesystem::create_directories(base_path_);
+        }
+    }
+
+    // Enable rebinding of allocators
+    template <typename U> friend class MMapAllocator;
+
+    ///
+    /// @brief Rebinding constructor: adopt the base path and access hint of ``other``.
+    ///
+    template <typename U>
+    MMapAllocator(const MMapAllocator<U>& other)
+        : base_path_{other.base_path_}
+        , access_hint_{other.access_hint_} {}
+
+    ///
+    /// @brief Compare allocators
+    ///
+    /// Two allocators are equal if they use the same base path and access hint
+    ///
+    template <typename U> bool operator==(const MMapAllocator<U>& other) const {
+        return base_path_ == other.base_path_ && access_hint_ == other.access_hint_;
+    }
+
+    ///
+    /// @brief Allocate memory
+    ///
+    /// Creates a memory-mapped file with a unique name and returns a pointer to the
+    /// mapped region. On Linux, the configured access hint is passed to ``madvise``.
+    ///
+    /// @param n Number of elements to allocate
+    /// @return Pointer to allocated memory
+    /// @throws ANNException if ``sizeof(T) * n`` does not fit in ``size_t``
+    ///
+    [[nodiscard]] T* allocate(size_t n) {
+        // Guard the multiplication below: a wrapped byte count would map a region
+        // that is far smaller than the caller expects.
+        if (n > static_cast<size_t>(-1) / sizeof(T)) {
+            throw ANNEXCEPTION("Requested memory-mapped allocation size overflows size_t!");
+        }
+        const size_t bytes = sizeof(T) * n;
+
+        // Generate unique file path
+        const auto file_path = generate_file_path(bytes);
+
+        void* ptr = detail::MMapAllocationManager{}.allocate(bytes, file_path);
+
+        // Apply madvise hint if on Linux
+        apply_access_hint(ptr, bytes);
+
+        return static_cast<T*>(ptr);
+    }
+
+    ///
+    /// @brief Deallocate memory
+    ///
+    /// Hands the pointer back to the allocation manager, which releases the mapping.
+    ///
+    /// @param ptr Pointer to deallocate
+    /// @param n Number of elements (unused but required by allocator interface)
+    ///
+    void deallocate(void* ptr, size_t SVS_UNUSED(n)) {
+        detail::MMapAllocationManager::deallocate(ptr);
+    }
+
+    ///
+    /// @brief Construct an object
+    ///
+    /// Performs default initialization of the object (no value-initialization, so
+    /// trivially constructible types are left untouched).
+    ///
+    void construct(T* ptr) { ::new (static_cast<void*>(ptr)) T; }
+
+    ///
+    /// @brief Get the base path for allocations
+    ///
+    const std::filesystem::path& get_base_path() const { return base_path_; }
+
+    ///
+    /// @brief Get the access hint
+    ///
+    MMapAccessHint get_access_hint() const { return access_hint_; }
+
+    ///
+    /// @brief Set the access hint for future allocations
+    ///
+    void set_access_hint(MMapAccessHint hint) { access_hint_ = hint; }
+
+  private:
+    ///
+    /// @brief Apply madvise hint based on access pattern
+    ///
+    /// Does nothing for a null pointer, a zero-sized region, or on non-Linux builds.
+    ///
+    void apply_access_hint(void* ptr, size_t bytes) const {
+#ifdef __linux__
+        if (ptr == nullptr || bytes == 0) {
+            return;
+        }
+
+        int advice = MADV_NORMAL;
+        switch (access_hint_) {
+            case MMapAccessHint::Normal:
+                advice = MADV_NORMAL;
+                break;
+            case MMapAccessHint::Sequential:
+                advice = MADV_SEQUENTIAL;
+                break;
+            case MMapAccessHint::Random:
+                advice = MADV_RANDOM;
+                break;
+        }
+
+        // madvise is a hint, so ignore errors
+        (void)madvise(ptr, bytes, advice);
+#else
+        (void)ptr;
+        (void)bytes;
+#endif
+    }
+
+    ///
+    /// @brief Generate a unique file path for an allocation
+    ///
+    /// The name combines the calling thread's id, a process-wide sequence number and
+    /// the allocation size. The file is placed in the base path, or in the system
+    /// temporary directory when the base path is empty.
+    ///
+    std::filesystem::path generate_file_path(size_t bytes) const {
+        auto filename = fmt::format(
+            "mmap_alloc_{}_{}_{}.dat",
+            std::this_thread::get_id(),
+            detail::next_mmap_allocation_id(),
+            bytes
+        );
+
+        if (base_path_.empty()) {
+            return std::filesystem::temp_directory_path() / filename;
+        }
+        return base_path_ / filename;
+    }
+};
+
+} // namespace svs
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 95555eb15..0f5b65903 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -102,6 +102,7 @@ set(TEST_SOURCES
     ${TEST_DIR}/svs/concepts/distance.cpp
     # Core
     ${TEST_DIR}/svs/core/allocator.cpp
+    ${TEST_DIR}/svs/core/allocator_mmap.cpp
     ${TEST_DIR}/svs/core/compact.cpp
     ${TEST_DIR}/svs/core/data.cpp
     ${TEST_DIR}/svs/core/data/block.cpp
diff --git a/tests/svs/core/allocator_mmap.cpp b/tests/svs/core/allocator_mmap.cpp
new file mode 100644
index 000000000..68f0e7b7c
--- 
/dev/null
+++ b/tests/svs/core/allocator_mmap.cpp
@@ -0,0 +1,258 @@
+/*
+ * Copyright 2026 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Test file-backed MMapAllocator
+#include "svs/core/allocator_mmap.h"
+#include "svs/core/data.h"
+
+#include "catch2/catch_test_macros.hpp"
+
+#include <filesystem>
+#include <vector>
+
+namespace {
+
+CATCH_TEST_CASE("MMapAllocator Basic Operations", "[allocator][mmap]") {
+    auto temp_dir = std::filesystem::temp_directory_path() / "svs_mmap_test";
+    std::filesystem::create_directories(temp_dir);
+
+    CATCH_SECTION("Allocate and deallocate") {
+        svs::MMapAllocator<float> alloc(temp_dir);
+
+        constexpr size_t n = 1000;
+        float* ptr = alloc.allocate(n);
+        CATCH_REQUIRE(ptr != nullptr);
+
+        // The allocation must be backed by a file inside the base directory
+        CATCH_REQUIRE_FALSE(std::filesystem::is_empty(temp_dir));
+
+        // Write to the allocated memory
+        for (size_t i = 0; i < n; ++i) {
+            ptr[i] = static_cast<float>(i);
+        }
+
+        // Read back and verify
+        for (size_t i = 0; i < n; ++i) {
+            CATCH_REQUIRE(ptr[i] == static_cast<float>(i));
+        }
+
+        // Deallocate
+        alloc.deallocate(ptr, n);
+    }
+
+    CATCH_SECTION("Multiple allocations") {
+        svs::MMapAllocator<int> alloc(temp_dir);
+
+        std::vector<int*> ptrs;
+        constexpr size_t num_allocs = 5;
+        constexpr size_t alloc_size = 100;
+
+        // Allocate multiple blocks
+        for (size_t i = 0; i < num_allocs; ++i) {
+            int* ptr = alloc.allocate(alloc_size);
+            CATCH_REQUIRE(ptr != nullptr);
+            ptrs.push_back(ptr);
+
+            // Initialize
+            for (size_t j = 0; j < alloc_size; ++j) {
+                ptr[j] = static_cast<int>(i * 1000 + j);
+            }
+        }
+
+        // Verify all allocations
+        for (size_t i = 0; i < num_allocs; ++i) {
+            for (size_t j = 0; j < alloc_size; ++j) {
+                CATCH_REQUIRE(ptrs[i][j] == static_cast<int>(i * 1000 + j));
+            }
+        }
+
+        // Deallocate all
+        for (size_t i = 0; i < num_allocs; ++i) {
+            alloc.deallocate(ptrs[i], alloc_size);
+        }
+    }
+
+    CATCH_SECTION("Large allocation") {
+        svs::MMapAllocator<double> alloc(temp_dir);
+
+        constexpr size_t n = 1'000'000; // 1 million doubles
+        double* ptr = alloc.allocate(n);
+        CATCH_REQUIRE(ptr != nullptr);
+
+        // Spot check some values
+        ptr[0] = 1.0;
+        ptr[n / 2] = 2.0;
+        ptr[n - 1] = 3.0;
+
+        CATCH_REQUIRE(ptr[0] == 1.0);
+        CATCH_REQUIRE(ptr[n / 2] == 2.0);
+        CATCH_REQUIRE(ptr[n - 1] == 3.0);
+
+        alloc.deallocate(ptr, n);
+    }
+
+    CATCH_SECTION("Default path (temp directory)") {
+        svs::MMapAllocator<int> alloc; // No path specified
+
+        constexpr size_t n = 50;
+        int* ptr = alloc.allocate(n);
+        CATCH_REQUIRE(ptr != nullptr);
+
+        for (size_t i = 0; i < n; ++i) {
+            ptr[i] = static_cast<int>(i * 2);
+        }
+
+        for (size_t i = 0; i < n; ++i) {
+            CATCH_REQUIRE(ptr[i] == static_cast<int>(i * 2));
+        }
+
+        alloc.deallocate(ptr, n);
+    }
+
+    // Cleanup
+    std::filesystem::remove_all(temp_dir);
+}
+
+CATCH_TEST_CASE("MMapAllocator with SimpleData", "[allocator][mmap][integration]") {
+    auto temp_dir = std::filesystem::temp_directory_path() / "svs_mmap_simpledata_test";
+    std::filesystem::create_directories(temp_dir);
+
+    CATCH_SECTION("SimpleData with MMapAllocator") {
+        constexpr size_t num_vectors = 100;
+        constexpr size_t dims = 128;
+
+        using Alloc = svs::MMapAllocator<float>;
+        using Data = svs::data::SimpleData<float, svs::Dynamic, Alloc>;
+
+        // Create data with file-backed allocator
+        Data data(num_vectors, dims, Alloc{temp_dir});
+
+        // Write data
+        for (size_t i = 0; i < num_vectors; ++i) {
+            auto datum = data.get_datum(i);
+            for (size_t j = 0; j < dims; ++j) {
+                datum[j] = static_cast<float>(i * dims + j);
+            }
+        }
+
+        // Verify data
+        for (size_t i = 0; i < num_vectors; ++i) {
+            auto datum = data.get_datum(i);
+            for (size_t j = 0; j < dims; ++j) {
+                CATCH_REQUIRE(datum[j] == static_cast<float>(i * dims + j));
+            }
+        }
+
+        CATCH_REQUIRE(data.size() == num_vectors);
+        CATCH_REQUIRE(data.dimensions() == dims);
+    }
+
+    // Cleanup
+    std::filesystem::remove_all(temp_dir);
+}
+
+CATCH_TEST_CASE("MMapAllocator Rebinding", "[allocator][mmap]") {
+    auto temp_dir = std::filesystem::temp_directory_path() / "svs_mmap_rebind_test";
+    std::filesystem::create_directories(temp_dir);
+
+    CATCH_SECTION("Rebind allocator") {
+        svs::MMapAllocator<float> float_alloc(temp_dir);
+        svs::MMapAllocator<int> int_alloc(float_alloc);
+
+        // Both should use the same path
+        CATCH_REQUIRE(float_alloc.get_base_path() == int_alloc.get_base_path());
+
+        // Test allocations with rebound allocator
+        int* ptr = int_alloc.allocate(10);
+        CATCH_REQUIRE(ptr != nullptr);
+        ptr[0] = 42;
+        CATCH_REQUIRE(ptr[0] == 42);
+        int_alloc.deallocate(ptr, 10);
+    }
+
+    // Cleanup
+    std::filesystem::remove_all(temp_dir);
+}
+
+CATCH_TEST_CASE("MMapAllocator Equality", "[allocator][mmap]") {
+    auto temp_dir1 = std::filesystem::temp_directory_path() / "svs_mmap_eq1";
+    auto temp_dir2 = std::filesystem::temp_directory_path() / "svs_mmap_eq2";
+
+    CATCH_SECTION("Same path allocators are equal") {
+        svs::MMapAllocator<float> alloc1(temp_dir1);
+        svs::MMapAllocator<float> alloc2(temp_dir1);
+
+        CATCH_REQUIRE(alloc1 == alloc2);
+    }
+
+    CATCH_SECTION("Different path allocators are not equal") {
+        svs::MMapAllocator<float> alloc1(temp_dir1);
+        svs::MMapAllocator<float> alloc2(temp_dir2);
+
+        CATCH_REQUIRE_FALSE(alloc1 == alloc2);
+    }
+
+    CATCH_SECTION("Rebound allocators with same path are equal") {
+        svs::MMapAllocator<float> float_alloc(temp_dir1);
+        svs::MMapAllocator<int> int_alloc(float_alloc);
+
+        CATCH_REQUIRE(float_alloc == int_alloc);
+    }
+
+    // The allocator constructor creates missing base directories; remove them
+    // again so the test leaves nothing behind.
+    std::filesystem::remove_all(temp_dir1);
+    std::filesystem::remove_all(temp_dir2);
+}
+
+CATCH_TEST_CASE("MMapAllocator Access Hint", "[allocator][mmap]") {
+    auto temp_dir = std::filesystem::temp_directory_path() / "svs_mmap_hint_test";
+    std::filesystem::create_directories(temp_dir);
+
+    CATCH_SECTION("Hint is stored and propagated on rebind") {
+        svs::MMapAllocator<float> float_alloc(temp_dir, svs::MMapAccessHint::Sequential);
+        CATCH_REQUIRE(float_alloc.get_access_hint() == svs::MMapAccessHint::Sequential);
+
+        svs::MMapAllocator<int> int_alloc(float_alloc);
+        CATCH_REQUIRE(int_alloc.get_access_hint() == svs::MMapAccessHint::Sequential);
+    }
+
+    CATCH_SECTION("Hint participates in equality") {
+        svs::MMapAllocator<float> sequential(temp_dir, svs::MMapAccessHint::Sequential);
+        svs::MMapAllocator<float> normal(temp_dir);
+        CATCH_REQUIRE(normal.get_access_hint() == svs::MMapAccessHint::Normal);
+        CATCH_REQUIRE_FALSE(sequential == normal);
+
+        normal.set_access_hint(svs::MMapAccessHint::Sequential);
+        CATCH_REQUIRE(sequential == normal);
+    }
+
+    CATCH_SECTION("Allocation with a non-default hint") {
+        svs::MMapAllocator<int> alloc(temp_dir, svs::MMapAccessHint::Random);
+
+        constexpr size_t n = 64;
+        int* ptr = alloc.allocate(n);
+        CATCH_REQUIRE(ptr != nullptr);
+        ptr[n - 1] = 7;
+        CATCH_REQUIRE(ptr[n - 1] == 7);
+        alloc.deallocate(ptr, n);
+    }
+
+    // Cleanup
+    std::filesystem::remove_all(temp_dir);
+}
+
+} // anonymous namespace