1
0
Fork 0
mirror of https://git.rwth-aachen.de/acs/public/villas/node/ synced 2025-03-23 00:00:01 +01:00
VILLASnode/fpga/lib/gpu/gpu.cpp

528 lines
15 KiB
C++

/** GPU managment.
*
* @file
* @author Daniel Krebs <github@daniel-krebs.net>
* @copyright 2017-2018, Institute for Automation of Complex Power Systems, EONERC
* @license GNU General Public License (version 3)
*
* VILLASfpga
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*********************************************************************************/
#include <cstdio>
#include <cstdint>
#include <sys/mman.h>
#include <memory>
#include <algorithm>
#include <villas/gpu.hpp>
#include <villas/log.hpp>
#include <villas/kernel/pci.h>
#include <villas/memory_manager.hpp>
#include <cuda.h>
#include <cuda_runtime.h>
#include <gdrapi.h>
#include "kernels.hpp"
namespace villas {
namespace gpu {
static GpuFactory gpuFactory;
GpuAllocator::GpuAllocator(Gpu& gpu) :
BaseAllocator(gpu.masterPciEAddrSpaceId),
gpu(gpu)
{
free = [&](MemoryBlock* mem) {
cudaSetDevice(gpu.gpuId);
if(cudaFree(reinterpret_cast<void*>(mem->getOffset())) != cudaSuccess) {
logger->warn("cudaFree() failed for {:#x} of size {:#x}",
mem->getOffset(), mem->getSize());
}
removeMemoryBlock(*mem);
};
}
std::string
villas::gpu::GpuAllocator::getName() const
{
std::stringstream name;
name << "GpuAlloc" << getAddrSpaceId();
return name.str();
}
GpuFactory::GpuFactory() :
Plugin(Plugin::Type::Gpu, "GPU")
{
logger = loggerGetOrCreate("GpuFactory");
}
// required to be defined here for PIMPL to compile
Gpu::~Gpu()
{
auto& mm = MemoryManager::get();
mm.removeAddressSpace(masterPciEAddrSpaceId);
}
// we use PIMPL in order to hide gdrcopy types from the public header
class Gpu::impl {
public:
gdr_t gdr;
struct pci_device pdev;
};
std::string Gpu::getName() const
{
cudaDeviceProp deviceProp;
if(cudaGetDeviceProperties(&deviceProp, gpuId) != cudaSuccess) {
// logger not yet availabe
loggerGetOrCreate("Gpu")->error("Cannot retrieve properties for GPU {}", gpuId);
throw std::exception();
}
std::stringstream name;
name << "gpu" << gpuId << "(" << deviceProp.name << ")";
return name.str();
}
bool Gpu::registerIoMemory(const MemoryBlock& mem)
{
auto& mm = MemoryManager::get();
const auto pciAddrSpaceId = mm.getPciAddressSpace();
// Check if we need to map anything at all, maybe it's already reachable
try {
// TODO: there might already be a path through the graph, but there's no
// overlapping window, so this will fail badly!
auto translation = mm.getTranslation(masterPciEAddrSpaceId,
mem.getAddrSpaceId());
if(translation.getSize() >= mem.getSize()) {
// there is already a sufficient path
logger->debug("Already mapped through another mapping");
return true;
} else {
logger->warn("There's already a mapping, but too small");
}
} catch(const std::out_of_range&) {
// not yet reachable, that's okay, proceed
}
// In order to register IO memory with CUDA, it has to be mapped to the VA
// space of the current process (requirement of CUDA API). Check this now.
MemoryManager::AddressSpaceId mappedBaseAddrSpaceId;
try {
auto path = mm.findPath(mm.getProcessAddressSpace(), mem.getAddrSpaceId());
// first node in path is the mapped memory space whose virtual address
// we need to hand to CUDA
mappedBaseAddrSpaceId = path.front();
} catch (const std::out_of_range&) {
logger->error("Memory not reachable from process, but required by CUDA");
return false;
}
// determine the base address of the mapped memory region needed by CUDA
const auto translationProcess = mm.getTranslationFromProcess(mappedBaseAddrSpaceId);
const uintptr_t baseAddrForProcess = translationProcess.getLocalAddr(0);
// Now check that the memory is also reachable via PCIe bus, otherwise GPU
// has no means to access it.
uintptr_t baseAddrOnPci;
size_t sizeOnPci;
try {
auto translationPci = mm.getTranslation(pciAddrSpaceId,
mappedBaseAddrSpaceId);
baseAddrOnPci = translationPci.getLocalAddr(0);
sizeOnPci = translationPci.getSize();
} catch(const std::out_of_range&) {
logger->error("Memory is not reachable via PCIe bus");
return false;
}
if(sizeOnPci < mem.getSize()) {
logger->warn("VA mapping of IO memory is too small: {:#x} instead of {:#x} bytes",
sizeOnPci, mem.getSize());
logger->warn("If something later on fails or behaves strangely, this might be the cause!");
}
cudaSetDevice(gpuId);
auto baseAddrVA = reinterpret_cast<void*>(baseAddrForProcess);
if(cudaHostRegister(baseAddrVA, sizeOnPci, cudaHostRegisterIoMemory) != cudaSuccess) {
logger->error("Cannot register IO memory for block {}", mem.getAddrSpaceId());
return false;
}
void* devicePointer = nullptr;
if(cudaHostGetDevicePointer(&devicePointer, baseAddrVA, 0) != cudaSuccess) {
logger->error("Cannot retrieve device pointer for IO memory");
return false;
}
mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), baseAddrOnPci,
sizeOnPci, "CudaIoMem", masterPciEAddrSpaceId, pciAddrSpaceId);
return true;
}
bool
Gpu::registerHostMemory(const MemoryBlock& mem)
{
auto& mm = MemoryManager::get();
auto translation = mm.getTranslationFromProcess(mem.getAddrSpaceId());
auto localBase = reinterpret_cast<void*>(translation.getLocalAddr(0));
int ret = cudaHostRegister(localBase, mem.getSize(), 0);
if(ret != cudaSuccess) {
logger->error("Cannot register memory block {} addr={:p} size={:#x} to CUDA: ret={}",
mem.getAddrSpaceId(), localBase, mem.getSize(), ret);
return false;
}
void* devicePointer = nullptr;
ret = cudaHostGetDevicePointer(&devicePointer, localBase, 0);
if(ret != cudaSuccess) {
logger->error("Cannot retrieve device pointer for IO memory: ret={}", ret);
return false;
}
mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), 0, mem.getSize(),
"CudaHostMem", masterPciEAddrSpaceId, mem.getAddrSpaceId());
return true;
}
bool Gpu::makeAccessibleToPCIeAndVA(const MemoryBlock& mem)
{
if(pImpl->gdr == nullptr) {
logger->warn("GDRcopy not available");
return false;
}
auto& mm = MemoryManager::get();
try {
auto path = mm.findPath(masterPciEAddrSpaceId, mem.getAddrSpaceId());
// if first hop is the PCIe bus, we know that memory is off-GPU
if(path.front() == mm.getPciAddressSpace()) {
throw std::out_of_range("Memory block is outside of this GPU");
}
} catch (const std::out_of_range&) {
logger->error("Trying to map non-GPU memory block");
return false;
}
logger->debug("retrieve complete device pointer from point of view of GPU");
// retrieve complete device pointer from point of view of GPU
auto translation = mm.getTranslation(masterPciEAddrSpaceId,
mem.getAddrSpaceId());
CUdeviceptr devptr = translation.getLocalAddr(0);
int ret;
// required to set this flag before mapping
unsigned int enable = 1;
ret = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, devptr);
if(ret != CUDA_SUCCESS) {
logger->error("Cannot set pointer attributes on memory block {}: {}",
mem.getAddrSpaceId(), ret);
return false;
}
gdr_mh_t mh;
ret = gdr_pin_buffer(pImpl->gdr, devptr, mem.getSize(), 0, 0, &mh);
if(ret != 0) {
logger->error("Cannot pin memory block {} via gdrcopy: {}",
mem.getAddrSpaceId(), ret);
return false;
}
void* bar = nullptr;
ret = gdr_map(pImpl->gdr, mh, &bar, mem.getSize());
if(ret != 0) {
logger->error("Cannot map memory block {} via gdrcopy: {}",
mem.getAddrSpaceId(), ret);
return false;
}
gdr_info_t info;
ret = gdr_get_info(pImpl->gdr, mh, &info);
if(ret != 0) {
logger->error("Cannot get info for mapping of memory block {}: {}",
mem.getAddrSpaceId(), ret);
return false;
}
const uintptr_t offset = info.va - devptr;
const uintptr_t userPtr = reinterpret_cast<uintptr_t>(bar) + offset;
logger->debug("BAR ptr: {:p}", bar);
logger->debug("info.va: {:#x}", info.va);
logger->debug("info.mapped_size: {:#x}", info.mapped_size);
logger->debug("info.page_size: {:#x}", info.page_size);
logger->debug("offset: {:#x}", offset);
logger->debug("user pointer: {:#x}", userPtr);
// mapping to acceses memory block from process
mm.createMapping(userPtr, 0, info.mapped_size, "GDRcopy",
mm.getProcessAddressSpace(), mem.getAddrSpaceId());
// retrieve bus address
uint64_t addr[8];
ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, 8);
for(int i = 0; i < ret; i++) {
logger->debug("DMA addr[{}]: {:#x}", i, addr[i]);
}
if(ret != 1) {
logger->error("Only one DMA address per block supported at the moment");
return false;
}
// mapping to access memory block from peer devices via PCIe
mm.createMapping(addr[0], 0, mem.getSize(), "GDRcopyDMA",
mm.getPciAddressSpace(), mem.getAddrSpaceId());
return true;
}
bool
Gpu::makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem)
{
// Check which kind of memory this is and where it resides
// There are two possibilities:
// - Host memory not managed by CUDA
// - IO memory somewhere on the PCIe bus
auto& mm = MemoryManager::get();
bool isIoMemory = false;
try {
auto path = mm.findPath(mm.getPciAddressSpace(), mem.getAddrSpaceId());
isIoMemory = true;
} catch(const std::out_of_range&) {
// not reachable via PCI -> not IO memory
}
if(isIoMemory) {
logger->debug("Memory block {} is assumed to be IO memory",
mem.getAddrSpaceId());
return registerIoMemory(mem);
} else {
logger->debug("Memory block {} is assumed to be non-CUDA host memory",
mem.getAddrSpaceId());
return registerHostMemory(mem);
}
}
void Gpu::memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
{
auto& mm = MemoryManager::get();
auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
src.getAddrSpaceId());
const void* src_buf = reinterpret_cast<void*>(src_translation.getLocalAddr(0));
auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
dst.getAddrSpaceId());
void* dst_buf = reinterpret_cast<void*>(dst_translation.getLocalAddr(0));
cudaSetDevice(gpuId);
cudaMemcpy(dst_buf, src_buf, size, cudaMemcpyDefault);
}
void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
{
auto& mm = MemoryManager::get();
auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
src.getAddrSpaceId());
auto src_buf = reinterpret_cast<uint8_t*>(src_translation.getLocalAddr(0));
auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
dst.getAddrSpaceId());
auto dst_buf = reinterpret_cast<uint8_t*>(dst_translation.getLocalAddr(0));
cudaSetDevice(gpuId);
kernel_memcpy<<<1, 1>>>(dst_buf, src_buf, size);
cudaDeviceSynchronize();
}
MemoryTranslation
Gpu::translate(const MemoryBlock& dst)
{
auto& mm = MemoryManager::get();
return mm.getTranslation(masterPciEAddrSpaceId, dst.getAddrSpaceId());
}
std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
GpuAllocator::allocateBlock(size_t size)
{
cudaSetDevice(gpu.gpuId);
void* addr;
auto& mm = MemoryManager::get();
// search for an existing chunk that has enough free memory
auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) {
return chunk->getAvailableMemory() >= size;
});
if(chunk != chunks.end()) {
logger->debug("Found existing chunk that can host the requested block");
return (*chunk)->allocateBlock(size);
} else {
// allocate a new chunk
// rounded-up multiple of GPU page size
const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize;
logger->debug("Allocate new chunk of {:#x} bytes", chunkSize);
if(cudaSuccess != cudaMalloc(&addr, chunkSize)) {
logger->error("cudaMalloc(..., size={}) failed", chunkSize);
throw std::bad_alloc();
}
// assemble name for this block
std::stringstream name;
name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
const auto localAddr = reinterpret_cast<uintptr_t>(addr);
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
mem(new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free);
insertMemoryBlock(*mem);
// already make accessible to CPU
gpu.makeAccessibleToPCIeAndVA(*mem);
// create a new allocator to manage the chunk and push to chunk list
chunks.push_front(std::make_unique<LinearAllocator>(std::move(mem)));
// call again, this time there's a large enough chunk
return allocateBlock(size);
}
}
Gpu::Gpu(int gpuId) :
pImpl{std::make_unique<impl>()},
gpuId(gpuId)
{
logger = loggerGetOrCreate(getName());
pImpl->gdr = gdr_open();
if(pImpl->gdr == nullptr) {
logger->warn("No GDRcopy support enabled, cannot open /dev/gdrdrv");
}
}
bool Gpu::init()
{
auto& mm = MemoryManager::get();
const auto gpuPciEAddrSpaceName = mm.getMasterAddrSpaceName(getName(), "PCIe");
masterPciEAddrSpaceId = mm.getOrCreateAddressSpace(gpuPciEAddrSpaceName);
allocator = std::make_unique<GpuAllocator>(*this);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, gpuId);
pImpl->pdev.slot = {
deviceProp.pciDomainID,
deviceProp.pciBusID,
deviceProp.pciDeviceID,
0};
struct pci_region* pci_regions = nullptr;
const size_t pci_num_regions = pci_get_regions(&pImpl->pdev, &pci_regions);
for(size_t i = 0; i < pci_num_regions; i++) {
const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
logger->info("BAR{}: bus addr={:#x} size={:#x}",
pci_regions[i].num, pci_regions[i].start, region_size);
char name[] = "BARx";
name[3] = '0' + pci_regions[i].num;
auto gpuBarXAddrSpaceName = mm.getSlaveAddrSpaceName(getName(), name);
auto gpuBarXAddrSpaceId = mm.getOrCreateAddressSpace(gpuBarXAddrSpaceName);
mm.createMapping(pci_regions[i].start, 0, region_size,
std::string("PCI-") + name,
mm.getPciAddressSpace(), gpuBarXAddrSpaceId);
}
free(pci_regions);
return true;
}
std::list<std::unique_ptr<Gpu>>
GpuFactory::make()
{
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
std::list<std::unique_ptr<Gpu>> gpuList;
for(int gpuId = 0; gpuId < deviceCount; gpuId++) {
if(cudaSetDevice(gpuId) != cudaSuccess) {
logger->warn("Cannot activate GPU {}", gpuId);
continue;
}
auto gpu = std::make_unique<Gpu>(gpuId);
if(not gpu->init()) {
logger->warn("Cannot initialize GPU {}", gpuId);
continue;
}
gpuList.emplace_back(std::move(gpu));
}
logger->info("Initialized {} GPUs", gpuList.size());
for(auto& gpu : gpuList) {
logger->debug(" - {}", gpu->getName());
}
return gpuList;
}
} // namespace villas
} // namespace gpu