/* GPU managment. * * Author: Daniel Krebs * SPDX-FileCopyrightText: 2017 Institute for Automation of Complex Power Systems, RWTH Aachen University * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include #include #include #include #include #include "kernels.hpp" using namespace villas::gpu; static GpuFactory gpuFactory; GpuAllocator::GpuAllocator(Gpu &gpu) : BaseAllocator(gpu.masterPciEAddrSpaceId), gpu(gpu) { free = [&](MemoryBlock *mem) { cudaSetDevice(gpu.gpuId); if (cudaFree(reinterpret_cast(mem->getOffset())) != cudaSuccess) { logger->warn("cudaFree() failed for {:#x} of size {:#x}", mem->getOffset(), mem->getSize()); } removeMemoryBlock(*mem); }; } std::string villas::gpu::GpuAllocator::getName() const { std::stringstream name; name << "GpuAlloc" << getAddrSpaceId(); return name.str(); } GpuFactory::GpuFactory() : logger(villas::Log::get("gpu:factory")), Plugin("cuda", "CUDA capable GPUs") {} // Required to be defined here for PIMPL to compile Gpu::~Gpu() { auto &mm = MemoryManager::get(); mm.removeAddressSpace(masterPciEAddrSpaceId); } // We use PIMPL in order to hide gdrcopy types from the public header class Gpu::impl { public: gdr_t gdr; struct pci_device pdev; }; std::string Gpu::getName() const { cudaDeviceProp deviceProp; if (cudaGetDeviceProperties(&deviceProp, gpuId) != cudaSuccess) { // Logger not yet availabe villas::Log::get("gpu")->error("Cannot retrieve properties for GPU {}", gpuId); throw std::exception(); } std::stringstream name; name << "gpu" << gpuId << "(" << deviceProp.name << ")"; return name.str(); } bool Gpu::registerIoMemory(const MemoryBlock &mem) { auto &mm = MemoryManager::get(); const auto pciAddrSpaceId = mm.getPciAddressSpace(); // Check if we need to map anything at all, maybe it's already reachable try { // TODO: there might already be a path through the graph, but there's no // overlapping window, so this will fail badly! auto translation = mm.getTranslation(masterPciEAddrSpaceId, mem.getAddrSpaceId()); if (translation.getSize() >= mem.getSize()) // There is already a sufficient path logger->debug("Already mapped through another mapping"); return true; else logger->warn("There's already a mapping, but too small"); } catch (const std::out_of_range &) { // Not yet reachable, that's okay, proceed } // In order to register IO memory with CUDA, it has to be mapped to the VA // space of the current process (requirement of CUDA API). Check this now. MemoryManager::AddressSpaceId mappedBaseAddrSpaceId; try { auto path = mm.findPath(mm.getProcessAddressSpace(), mem.getAddrSpaceId()); // First node in path is the mapped memory space whose virtual address // we need to hand to CUDA mappedBaseAddrSpaceId = path.front(); } catch (const std::out_of_range &) { logger->error("Memory not reachable from process, but required by CUDA"); return false; } // Determine the base address of the mapped memory region needed by CUDA const auto translationProcess = mm.getTranslationFromProcess(mappedBaseAddrSpaceId); const uintptr_t baseAddrForProcess = translationProcess.getLocalAddr(0); // Now check that the memory is also reachable via PCIe bus, otherwise GPU // has no means to access it. uintptr_t baseAddrOnPci; size_t sizeOnPci; try { auto translationPci = mm.getTranslation(pciAddrSpaceId, mappedBaseAddrSpaceId); baseAddrOnPci = translationPci.getLocalAddr(0); sizeOnPci = translationPci.getSize(); } catch (const std::out_of_range &) { logger->error("Memory is not reachable via PCIe bus"); return false; } if (sizeOnPci < mem.getSize()) { logger->warn( "VA mapping of IO memory is too small: {:#x} instead of {:#x} bytes", sizeOnPci, mem.getSize()); logger->warn("If something later on fails or behaves strangely, this might " "be the cause!"); } cudaSetDevice(gpuId); auto baseAddrVA = reinterpret_cast(baseAddrForProcess); if (cudaHostRegister(baseAddrVA, sizeOnPci, cudaHostRegisterIoMemory) != cudaSuccess) { logger->error("Cannot register IO memory for block {}", mem.getAddrSpaceId()); return false; } void *devicePointer = nullptr; if (cudaHostGetDevicePointer(&devicePointer, baseAddrVA, 0) != cudaSuccess) { logger->error("Cannot retrieve device pointer for IO memory"); return false; } mm.createMapping(reinterpret_cast(devicePointer), baseAddrOnPci, sizeOnPci, "CudaIoMem", masterPciEAddrSpaceId, pciAddrSpaceId); return true; } bool Gpu::registerHostMemory(const MemoryBlock &mem) { auto &mm = MemoryManager::get(); auto translation = mm.getTranslationFromProcess(mem.getAddrSpaceId()); auto localBase = reinterpret_cast(translation.getLocalAddr(0)); int ret = cudaHostRegister(localBase, mem.getSize(), 0); if (ret != cudaSuccess) { logger->error( "Cannot register memory block {} addr={:p} size={:#x} to CUDA: ret={}", mem.getAddrSpaceId(), localBase, mem.getSize(), ret); return false; } void *devicePointer = nullptr; ret = cudaHostGetDevicePointer(&devicePointer, localBase, 0); if (ret != cudaSuccess) { logger->error("Cannot retrieve device pointer for IO memory: ret={}", ret); return false; } mm.createMapping(reinterpret_cast(devicePointer), 0, mem.getSize(), "CudaHostMem", masterPciEAddrSpaceId, mem.getAddrSpaceId()); return true; } bool Gpu::makeAccessibleToPCIeAndVA(const MemoryBlock &mem) { if (pImpl->gdr == nullptr) { logger->warn("GDRcopy not available"); return false; } auto &mm = MemoryManager::get(); try { auto path = mm.findPath(masterPciEAddrSpaceId, mem.getAddrSpaceId()); // If first hop is the PCIe bus, we know that memory is off-GPU if (path.front() == mm.getPciAddressSpace()) throw std::out_of_range("Memory block is outside of this GPU"); } catch (const std::out_of_range &) { logger->error("Trying to map non-GPU memory block"); return false; } logger->debug("retrieve complete device pointer from point of view of GPU"); // Retrieve complete device pointer from point of view of GPU auto translation = mm.getTranslation(masterPciEAddrSpaceId, mem.getAddrSpaceId()); CUdeviceptr devptr = translation.getLocalAddr(0); int ret; // Required to set this flag before mapping unsigned int enable = 1; ret = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, devptr); if (ret != CUDA_SUCCESS) { logger->error("Cannot set pointer attributes on memory block {}: {}", mem.getAddrSpaceId(), ret); return false; } gdr_mh_t mh; ret = gdr_pin_buffer(pImpl->gdr, devptr, mem.getSize(), 0, 0, &mh); if (ret != 0) { logger->error("Cannot pin memory block {} via gdrcopy: {}", mem.getAddrSpaceId(), ret); return false; } void *bar = nullptr; ret = gdr_map(pImpl->gdr, mh, &bar, mem.getSize()); if (ret != 0) { logger->error("Cannot map memory block {} via gdrcopy: {}", mem.getAddrSpaceId(), ret); return false; } gdr_info_t info; ret = gdr_get_info(pImpl->gdr, mh, &info); if (ret != 0) { logger->error("Cannot get info for mapping of memory block {}: {}", mem.getAddrSpaceId(), ret); return false; } const uintptr_t offset = info.va - devptr; const uintptr_t userPtr = reinterpret_cast(bar) + offset; logger->debug("BAR ptr: {:p}", bar); logger->debug("info.va: {:#x}", info.va); logger->debug("info.mapped_size: {:#x}", info.mapped_size); logger->debug("info.page_size: {:#x}", info.page_size); logger->debug("offset: {:#x}", offset); logger->debug("user pointer: {:#x}", userPtr); // Mapping to acceses memory block from process mm.createMapping(userPtr, 0, info.mapped_size, "GDRcopy", mm.getProcessAddressSpace(), mem.getAddrSpaceId()); // Retrieve bus address uint64_t addr[8]; ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, 8); for (int i = 0; i < ret; i++) logger->debug("DMA addr[{}]: {:#x}", i, addr[i]); if (ret != 1) { logger->error("Only one DMA address per block supported at the moment"); return false; } // Mapping to access memory block from peer devices via PCIe mm.createMapping(addr[0], 0, mem.getSize(), "GDRcopyDMA", mm.getPciAddressSpace(), mem.getAddrSpaceId()); return true; } bool Gpu::makeAccessibleFromPCIeOrHostRam(const MemoryBlock &mem) { // Check which kind of memory this is and where it resides // There are two possibilities: // - Host memory not managed by CUDA // - IO memory somewhere on the PCIe bus auto &mm = MemoryManager::get(); bool isIoMemory = false; try { auto path = mm.findPath(mm.getPciAddressSpace(), mem.getAddrSpaceId()); isIoMemory = true; } catch (const std::out_of_range &) { // Not reachable via PCI -> not IO memory } if (isIoMemory) { logger->debug("Memory block {} is assumed to be IO memory", mem.getAddrSpaceId()); return registerIoMemory(mem); } else { logger->debug("Memory block {} is assumed to be non-CUDA host memory", mem.getAddrSpaceId()); return registerHostMemory(mem); } } void Gpu::memcpySync(const MemoryBlock &src, const MemoryBlock &dst, size_t size) { auto &mm = MemoryManager::get(); auto src_translation = mm.getTranslation(masterPciEAddrSpaceId, src.getAddrSpaceId()); const void *src_buf = reinterpret_cast(src_translation.getLocalAddr(0)); auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId, dst.getAddrSpaceId()); void *dst_buf = reinterpret_cast(dst_translation.getLocalAddr(0)); cudaSetDevice(gpuId); cudaMemcpy(dst_buf, src_buf, size, cudaMemcpyDefault); } void Gpu::memcpyKernel(const MemoryBlock &src, const MemoryBlock &dst, size_t size) { auto &mm = MemoryManager::get(); auto src_translation = mm.getTranslation(masterPciEAddrSpaceId, src.getAddrSpaceId()); auto src_buf = reinterpret_cast(src_translation.getLocalAddr(0)); auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId, dst.getAddrSpaceId()); auto dst_buf = reinterpret_cast(dst_translation.getLocalAddr(0)); cudaSetDevice(gpuId); kernel_memcpy<<<1, 1>>>(dst_buf, src_buf, size); cudaDeviceSynchronize(); } MemoryTranslation Gpu::translate(const MemoryBlock &dst) { auto &mm = MemoryManager::get(); return mm.getTranslation(masterPciEAddrSpaceId, dst.getAddrSpaceId()); } std::unique_ptr GpuAllocator::allocateBlock(size_t size) { cudaSetDevice(gpu.gpuId); void *addr; auto &mm = MemoryManager::get(); // Search for an existing chunk that has enough free memory auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto &chunk) { return chunk->getAvailableMemory() >= size; }); if (chunk != chunks.end()) { logger->debug("Found existing chunk that can host the requested block"); return (*chunk)->allocateBlock(size); } else { // Allocate a new chunk // Rounded-up multiple of GPU page size const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize; logger->debug("Allocate new chunk of {:#x} bytes", chunkSize); if (cudaSuccess != cudaMalloc(&addr, chunkSize)) { logger->error("cudaMalloc(..., size={}) failed", chunkSize); throw std::bad_alloc(); } // Assemble name for this block std::stringstream name; name << std::showbase << std::hex << reinterpret_cast(addr); auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str()); auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName); const auto localAddr = reinterpret_cast(addr); std::unique_ptr mem( new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free); insertMemoryBlock(*mem); // Already make accessible to CPU gpu.makeAccessibleToPCIeAndVA(*mem); // Create a new allocator to manage the chunk and push to chunk list chunks.push_front(std::make_unique(std::move(mem))); // Call again, this time there's a large enough chunk return allocateBlock(size); } } Gpu::Gpu(int gpuId) : pImpl{std::make_unique()}, gpuId(gpuId) { logger = villas::Log::get(getName()); pImpl->gdr = gdr_open(); if (pImpl->gdr == nullptr) { logger->warn("No GDRcopy support enabled, cannot open /dev/gdrdrv"); } } bool Gpu::init() { auto &mm = MemoryManager::get(); const auto gpuPciEAddrSpaceName = mm.getMasterAddrSpaceName(getName(), "pcie"); masterPciEAddrSpaceId = mm.getOrCreateAddressSpace(gpuPciEAddrSpaceName); allocator = std::make_unique(*this); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, gpuId); pImpl->pdev.slot = {deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID, 0}; struct pci_region *pci_regions = nullptr; const size_t pci_num_regions = pci_get_regions(&pImpl->pdev, &pci_regions); for (size_t i = 0; i < pci_num_regions; i++) { const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1; logger->info("BAR{}: bus addr={:#x} size={:#x}", pci_regions[i].num, pci_regions[i].start, region_size); char name[] = "BARx"; name[3] = '0' + pci_regions[i].num; auto gpuBarXAddrSpaceName = mm.getSlaveAddrSpaceName(getName(), name); auto gpuBarXAddrSpaceId = mm.getOrCreateAddressSpace(gpuBarXAddrSpaceName); mm.createMapping(pci_regions[i].start, 0, region_size, std::string("PCI-") + name, mm.getPciAddressSpace(), gpuBarXAddrSpaceId); } free(pci_regions); return true; } std::list> GpuFactory::make() { int deviceCount = 0; cudaGetDeviceCount(&deviceCount); std::list> gpuList; for (int gpuId = 0; gpuId < deviceCount; gpuId++) { if (cudaSetDevice(gpuId) != cudaSuccess) { logger->warn("Cannot activate GPU {}", gpuId); continue; } auto gpu = std::make_unique(gpuId); if (not gpu->init()) { logger->warn("Cannot initialize GPU {}", gpuId); continue; } gpuList.emplace_back(std::move(gpu)); } logger->info("Initialized {} GPUs", gpuList.size()); for (auto &gpu : gpuList) { logger->debug(" - {}", gpu->getName()); } return gpuList; }