
gpu: always allocate page-sized chunks, then use LinearAllocator

This was necessary in order to make the memory available via GDRcopy
when multiple small allocations were made. cudaMalloc() would return
multiple memory chunks located in the same GPU page, which GDRcopy
refuses to map (`gdrdrv:offset != 0 is not supported`).
As a side effect, this also keeps the number of BAR mappings created
via GDRcopy low, since the number of available mappings appears to be quite limited.
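The effect of the page-size rounding introduced by this change can be seen in the small standalone sketch below (illustration only, not part of the commit; `chunkSizeFor` is a hypothetical helper that mirrors the expression used in GpuAllocator::allocateBlock further down):

#include <cstdio>
#include <cstddef>
#include <initializer_list>

// GPU page size as introduced by this commit (64 KiB).
static constexpr size_t GpuPageSize = 64UL << 10;

// Hypothetical helper mirroring the chunk-size computation in the patch:
// round the requested size up past the next GPU page boundary. Note that a
// request which is already page-aligned still gains one extra page.
static size_t chunkSizeFor(size_t size)
{
	return size - (size & (GpuPageSize - 1)) + GpuPageSize;
}

int main()
{
	for (size_t size : {100UL, 4096UL, 65536UL, 65537UL})
		std::printf("request %6zu -> chunk %7zu\n", size, chunkSizeFor(size));
	return 0;
}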
Daniel Krebs 2018-07-20 16:46:55 +02:00
parent 375b6b5cd3
commit 8a06e96e92
2 changed files with 44 additions and 16 deletions


@@ -365,29 +365,53 @@ GpuAllocator::allocateBlock(size_t size)
 	cudaSetDevice(gpu.gpuId);
 
 	void* addr;
-	if(cudaSuccess != cudaMalloc(&addr, size)) {
-		logger->error("cudaMalloc(..., size={}) failed", size);
-		throw std::bad_alloc();
-	}
-
 	auto& mm = MemoryManager::get();
 
-	// assemble name for this block
-	std::stringstream name;
-	name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
-
-	auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
-	auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
-
-	const auto localAddr = reinterpret_cast<uintptr_t>(addr);
-	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
-		mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
-
-	insertMemoryBlock(*mem);
-
-	gpu.makeAccessibleToPCIeAndVA(*mem);
-
-	return mem;
+	// search for an existing chunk that has enough free memory
+	auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) {
+		return chunk->getAvailableMemory() >= size;
+	});
+
+	if(chunk != chunks.end()) {
+		logger->debug("Found existing chunk that can host the requested block");
+		return (*chunk)->allocateBlock(size);
+	} else {
+		// allocate a new chunk
+
+		// rounded-up multiple of GPU page size
+		const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize;
+		logger->debug("Allocate new chunk of {:#x} bytes", chunkSize);
+
+		if(cudaSuccess != cudaMalloc(&addr, chunkSize)) {
+			logger->error("cudaMalloc(..., size={}) failed", chunkSize);
+			throw std::bad_alloc();
+		}
+
+		// assemble name for this block
+		std::stringstream name;
+		name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
+
+		auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
+		auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
+
+		const auto localAddr = reinterpret_cast<uintptr_t>(addr);
+		std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
+			mem(new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free);
+
+		insertMemoryBlock(*mem);
+
+		// already make accessible to CPU
+		gpu.makeAccessibleToPCIeAndVA(*mem);
+
+		// create a new allocator to manage the chunk and push to chunk list
+		chunks.push_front(std::make_unique<LinearAllocator>(std::move(mem)));
+
+		// call again, this time there's a large enough chunk
+		return allocateBlock(size);
+	}
 }
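The new allocation path above follows a common pattern: keep a list of page-sized chunks, each managed by a linear (bump) allocator, and fall back to allocating a fresh, page-rounded chunk whenever no existing chunk has enough free space. A minimal standalone sketch of that pattern is shown below (hypothetical names, plain malloc() standing in for cudaMalloc(), no error handling; it is not VILLASnode's actual MemoryBlock/LinearAllocator API):

#include <cstdio>
#include <cstdlib>
#include <cstddef>
#include <list>
#include <memory>

// Hypothetical bump allocator over one fixed-size chunk of host memory.
class BumpChunk {
public:
	explicit BumpChunk(size_t size)
		: base(static_cast<char*>(std::malloc(size))), capacity(size), used(0) {}
	~BumpChunk() { std::free(base); }

	size_t available() const { return capacity - used; }

	// Caller must check available() first; offsets only ever grow (no free list).
	void* allocate(size_t size)
	{
		void* p = base + used;
		used += size;
		return p;
	}

private:
	char* base;
	size_t capacity;
	size_t used;
};

class ChunkedAllocator {
public:
	static constexpr size_t PageSize = 64UL << 10;

	void* allocate(size_t size)
	{
		// search for an existing chunk that has enough free memory
		for (auto& chunk : chunks)
			if (chunk->available() >= size)
				return chunk->allocate(size);

		// otherwise create a new page-rounded chunk and retry
		const size_t chunkSize = size - (size & (PageSize - 1)) + PageSize;
		chunks.push_front(std::make_unique<BumpChunk>(chunkSize));
		return allocate(size);
	}

private:
	std::list<std::unique_ptr<BumpChunk>> chunks;
};

int main()
{
	ChunkedAllocator allocator;
	void* a = allocator.allocate(100);
	void* b = allocator.allocate(200); // served from the same 64 KiB chunk as a
	std::printf("a=%p b=%p\n", a, b);
	return 0;
}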


@@ -62,6 +62,8 @@ private:
 class GpuAllocator : public BaseAllocator<GpuAllocator> {
 public:
+	static constexpr size_t GpuPageSize = 64UL << 10;
+
 	GpuAllocator(Gpu& gpu);
 
 	std::string getName() const;
@@ -71,6 +73,8 @@ public:
 private:
 	Gpu& gpu;
 
+	// TODO: replace by multimap (key is available memory)
+	std::list<std::unique_ptr<LinearAllocator>> chunks;
 };
 
 class GpuFactory : public Plugin {