mirror of
https://git.rwth-aachen.de/acs/public/villas/node/
synced 2025-03-09 00:00:00 +01:00
gpu: always allocate page-sized chunks, then use LinearAllocator
This was necessary in order to make the memory available via GDRcopy when multiple small allocations were made. cudaMalloc() would return multiple memory chunks located in the same GPU page, which GDRcopy pretty much dislikes (`gdrdrv:offset != 0 is not supported`). As a side effect, this will keep the number of BAR-mappings done via GDRcopy low, because they seem to be quite limited.
This commit is contained in:
parent
375b6b5cd3
commit
8a06e96e92
2 changed files with 44 additions and 16 deletions
|
@ -365,29 +365,53 @@ GpuAllocator::allocateBlock(size_t size)
|
|||
cudaSetDevice(gpu.gpuId);
|
||||
|
||||
void* addr;
|
||||
if(cudaSuccess != cudaMalloc(&addr, size)) {
|
||||
logger->error("cudaMalloc(..., size={}) failed", size);
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
// assemble name for this block
|
||||
std::stringstream name;
|
||||
name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
|
||||
// search for an existing chunk that has enough free memory
|
||||
auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) {
|
||||
return chunk->getAvailableMemory() >= size;
|
||||
});
|
||||
|
||||
auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
|
||||
auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
|
||||
|
||||
const auto localAddr = reinterpret_cast<uintptr_t>(addr);
|
||||
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
|
||||
mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
|
||||
if(chunk != chunks.end()) {
|
||||
logger->debug("Found existing chunk that can host the requested block");
|
||||
|
||||
insertMemoryBlock(*mem);
|
||||
return (*chunk)->allocateBlock(size);
|
||||
|
||||
gpu.makeAccessibleToPCIeAndVA(*mem);
|
||||
} else {
|
||||
// allocate a new chunk
|
||||
|
||||
return mem;
|
||||
// rounded-up multiple of GPU page size
|
||||
const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize;
|
||||
logger->debug("Allocate new chunk of {:#x} bytes", chunkSize);
|
||||
|
||||
if(cudaSuccess != cudaMalloc(&addr, chunkSize)) {
|
||||
logger->error("cudaMalloc(..., size={}) failed", chunkSize);
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
|
||||
// assemble name for this block
|
||||
std::stringstream name;
|
||||
name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
|
||||
|
||||
auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
|
||||
auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
|
||||
|
||||
const auto localAddr = reinterpret_cast<uintptr_t>(addr);
|
||||
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
|
||||
mem(new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free);
|
||||
|
||||
insertMemoryBlock(*mem);
|
||||
|
||||
// already make accessible to CPU
|
||||
gpu.makeAccessibleToPCIeAndVA(*mem);
|
||||
|
||||
// create a new allocator to manage the chunk and push to chunk list
|
||||
chunks.push_front(std::make_unique<LinearAllocator>(std::move(mem)));
|
||||
|
||||
// call again, this time there's a large enough chunk
|
||||
return allocateBlock(size);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -62,6 +62,8 @@ private:
|
|||
|
||||
class GpuAllocator : public BaseAllocator<GpuAllocator> {
|
||||
public:
|
||||
static constexpr size_t GpuPageSize = 64UL << 10;
|
||||
|
||||
GpuAllocator(Gpu& gpu);
|
||||
|
||||
std::string getName() const;
|
||||
|
@ -71,6 +73,8 @@ public:
|
|||
|
||||
private:
|
||||
Gpu& gpu;
|
||||
// TODO: replace by multimap (key is available memory)
|
||||
std::list<std::unique_ptr<LinearAllocator>> chunks;
|
||||
};
|
||||
|
||||
class GpuFactory : public Plugin {
|
||||
|
|
Loading…
Add table
Reference in a new issue