diff --git a/fpga/lib/gpu/gpu.cpp b/fpga/lib/gpu/gpu.cpp index ffb7b7ad3..098ec5310 100644 --- a/fpga/lib/gpu/gpu.cpp +++ b/fpga/lib/gpu/gpu.cpp @@ -365,29 +365,53 @@ GpuAllocator::allocateBlock(size_t size) cudaSetDevice(gpu.gpuId); void* addr; - if(cudaSuccess != cudaMalloc(&addr, size)) { - logger->error("cudaMalloc(..., size={}) failed", size); - throw std::bad_alloc(); - } - auto& mm = MemoryManager::get(); - // assemble name for this block - std::stringstream name; - name << std::showbase << std::hex << reinterpret_cast(addr); + // search for an existing chunk that has enough free memory + auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) { + return chunk->getAvailableMemory() >= size; + }); - auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str()); - auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName); - const auto localAddr = reinterpret_cast(addr); - std::unique_ptr - mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free); + if(chunk != chunks.end()) { + logger->debug("Found existing chunk that can host the requested block"); - insertMemoryBlock(*mem); + return (*chunk)->allocateBlock(size); - gpu.makeAccessibleToPCIeAndVA(*mem); + } else { + // allocate a new chunk - return mem; + // rounded-up multiple of GPU page size + const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize; + logger->debug("Allocate new chunk of {:#x} bytes", chunkSize); + + if(cudaSuccess != cudaMalloc(&addr, chunkSize)) { + logger->error("cudaMalloc(..., size={}) failed", chunkSize); + throw std::bad_alloc(); + } + + // assemble name for this block + std::stringstream name; + name << std::showbase << std::hex << reinterpret_cast(addr); + + auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str()); + auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName); + + const auto localAddr = reinterpret_cast(addr); + std::unique_ptr + mem(new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free); + + insertMemoryBlock(*mem); + + // already make accessible to CPU + gpu.makeAccessibleToPCIeAndVA(*mem); + + // create a new allocator to manage the chunk and push to chunk list + chunks.push_front(std::make_unique(std::move(mem))); + + // call again, this time there's a large enough chunk + return allocateBlock(size); + } } diff --git a/fpga/lib/gpu/include/villas/gpu.hpp b/fpga/lib/gpu/include/villas/gpu.hpp index 00f1464ca..a2eb78efe 100644 --- a/fpga/lib/gpu/include/villas/gpu.hpp +++ b/fpga/lib/gpu/include/villas/gpu.hpp @@ -62,6 +62,8 @@ private: class GpuAllocator : public BaseAllocator { public: + static constexpr size_t GpuPageSize = 64UL << 10; + GpuAllocator(Gpu& gpu); std::string getName() const; @@ -71,6 +73,8 @@ public: private: Gpu& gpu; + // TODO: replace by multimap (key is available memory) + std::list> chunks; }; class GpuFactory : public Plugin {