From 13fd3f3c2a825d48e216eeba8d32ef6f53b0c00a Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 17:35:45 +0200
Subject: [PATCH] gpu: implement basic GPU plugin that can do DMA to and from
 its memory

Using CUDA, memory can be allocated on the GPU and shared to peers on
the PCIe bus such as the FPGA. Furthermore, the DMA on the GPU can also
be used to read and write to/from other memory on the PCIe bus, such as
BRAM on the FPGA.
---
 fpga/include/villas/plugin.hpp      |   1 +
 fpga/lib/gpu/CMakeLists.txt         |  24 +-
 fpga/lib/gpu/gdrcopy                |   2 +-
 fpga/lib/gpu/gpu.cpp                | 467 ++++++++++++++++++++++++++++
 fpga/lib/gpu/include/villas/gpu.hpp |  80 +++++
 fpga/lib/gpu/kernels.cu             |  42 +++
 fpga/lib/gpu/kernels.hpp            |  16 +
 fpga/tests/CMakeLists.txt           |   1 +
 fpga/tests/gpu.cpp                  | 129 ++++++++
 9 files changed, 755 insertions(+), 7 deletions(-)
 create mode 100644 fpga/lib/gpu/kernels.cu
 create mode 100644 fpga/lib/gpu/kernels.hpp
 create mode 100644 fpga/tests/gpu.cpp
diff --git a/fpga/include/villas/plugin.hpp b/fpga/include/villas/plugin.hpp
index 35a4912ba..fd3fd3fc8 100644
--- a/fpga/include/villas/plugin.hpp
+++ b/fpga/include/villas/plugin.hpp
@@ -40,6 +40,7 @@ public:
 		Unknown,
 		FpgaIp,
 		FpgaCard,
+		Gpu
 	};
 
 	Plugin(Type type, const std::string& name);
diff --git a/fpga/lib/gpu/CMakeLists.txt b/fpga/lib/gpu/CMakeLists.txt
index 68c3993ea..c2dd78804 100644
--- a/fpga/lib/gpu/CMakeLists.txt
+++ b/fpga/lib/gpu/CMakeLists.txt
@@ -1,14 +1,26 @@
-cmake_minimum_required(VERSION 3.7)
+cmake_minimum_required(VERSION 3.8)
 
 project(villas-gpu
 	VERSION 1.0
 	DESCRIPTION "VILLASgpu"
-	LANGUAGES C CXX)
+	LANGUAGES C CXX CUDA)
 
-# fail if CUDA not found
-find_package(CUDA QUIET REQUIRED)
+add_subdirectory(gdrcopy)
 
-cuda_add_library(villas-gpu SHARED gpu.cpp)
+add_library(villas-gpu SHARED gpu.cpp kernels.cu)
+
+target_compile_options(villas-gpu PRIVATE -g)
+
+set_source_files_properties(gpu.cpp PROPERTIES
+	LANGUAGE CUDA)
+
+target_include_directories(villas-gpu PRIVATE /opt/cuda/include)
+
+target_link_libraries(villas-gpu
+	PRIVATE villas-common gdrapi cuda)
 
 target_include_directories(villas-gpu
-	PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include)
+	PUBLIC
+	    ${CMAKE_CURRENT_LIST_DIR}/include
+	PRIVATE
+	    ${CMAKE_CURRENT_LIST_DIR})
diff --git a/fpga/lib/gpu/gdrcopy b/fpga/lib/gpu/gdrcopy
index 2b933176d..0441daa44 160000
--- a/fpga/lib/gpu/gdrcopy
+++ b/fpga/lib/gpu/gdrcopy
@@ -1 +1 @@
-Subproject commit 2b933176d0fd20f10bddfdf574a1d3229ca1ecdf
+Subproject commit 0441daa447b80260c4e11096f03e88f7be08bfa2
diff --git a/fpga/lib/gpu/gpu.cpp b/fpga/lib/gpu/gpu.cpp
index 4314fcb65..e8f7d58ec 100644
--- a/fpga/lib/gpu/gpu.cpp
+++ b/fpga/lib/gpu/gpu.cpp
@@ -1,7 +1,474 @@
+#include <cstdio>
+#include <cstdint>
+#include <sys/mman.h>
+
+#include <memory>
+#include <algorithm>
+
 #include <villas/gpu.hpp>
+#include <villas/log.hpp>
+#include <villas/kernel/pci.h>
+#include <memory_manager.hpp>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <gdrapi.h>
+
+#include "kernels.hpp"
 
 namespace villas {
 namespace gpu {
 
+static GpuFactory gpuFactory;
+
+GpuAllocator::GpuAllocator(Gpu& gpu) :
+    BaseAllocator(gpu.masterPciEAddrSpaceId),
+    gpu(gpu)
+{
+	free = [&](MemoryBlock* mem) {
+		cudaSetDevice(gpu.gpuId);
+		if(cudaFree(reinterpret_cast<void*>(mem->getOffset())) != cudaSuccess) {
+			logger->warn("cudaFree() failed for {:#x} of size {:#x}",
+			             mem->getOffset(), mem->getSize());
+		}
+
+		removeMemoryBlock(*mem);
+	};
+}
+
+std::string
+villas::gpu::GpuAllocator::getName() const
+{
+	std::stringstream name;
+	name << "GpuAlloc" << getAddrSpaceId();
+	return name.str();
+}
+
+
+GpuFactory::GpuFactory() :
+    Plugin(Plugin::Type::Gpu, "GPU")
+{
+	logger = loggerGetOrCreate("GpuFactory");
+}
+
+// required to be defined here for PIMPL to compile
+Gpu::~Gpu()
+{
+	auto& mm = MemoryManager::get();
+	mm.removeAddressSpace(masterPciEAddrSpaceId);
+}
+
+
+// we use PIMPL in order to hide gdrcopy types from the public header
+class Gpu::impl {
+public:
+	gdr_t gdr;
+	struct pci_device pdev;
+};
+
+std::string Gpu::getName() const
+{
+	cudaDeviceProp deviceProp;
+	if(cudaGetDeviceProperties(&deviceProp, gpuId) != cudaSuccess) {
+		// logger not yet availabe
+		loggerGetOrCreate("Gpu")->error("Cannot retrieve properties for GPU {}", gpuId);
+		throw std::exception();
+	}
+
+	std::stringstream name;
+	name << "gpu" << gpuId << "(" << deviceProp.name << ")";
+
+	return name.str();
+}
+
+bool Gpu::registerIoMemory(const MemoryBlock& mem)
+{
+	auto& mm = MemoryManager::get();
+	const auto pciAddrSpaceId = mm.getPciAddressSpace();
+
+	// Check if we need to map anything at all, maybe it's already reachable
+	try {
+		// TODO: there might already be a path through the graph, but there's no
+		// overlapping window, so this will fail badly!
+		auto translation = mm.getTranslation(masterPciEAddrSpaceId,
+		                                     mem.getAddrSpaceId());
+		if(translation.getSize() >= mem.getSize()) {
+			// there is already a sufficient path
+			logger->debug("Already mapped through another mapping");
+			return true;
+		} else {
+			logger->warn("There's already a mapping, but too small");
+		}
+	} catch(const std::out_of_range&) {
+		// not yet reachable, that's okay, proceed
+	}
+
+
+	// In order to register IO memory with CUDA, it has to be mapped to the VA
+	// space of the current process (requirement of CUDA API). Check this now.
+	MemoryManager::AddressSpaceId mappedBaseAddrSpaceId;
+	try {
+		auto path = mm.findPath(mm.getProcessAddressSpace(), mem.getAddrSpaceId());
+		// first node in path is the mapped memory space whose virtual address
+		// we need to hand to CUDA
+		mappedBaseAddrSpaceId = path.front();
+	} catch (const std::out_of_range&) {
+		logger->error("Memory not reachable from process, but required by CUDA");
+		return false;
+	}
+
+	// determine the base address of the mapped memory region needed by CUDA
+	const auto translationProcess = mm.getTranslationFromProcess(mappedBaseAddrSpaceId);
+	const uintptr_t baseAddrForProcess = translationProcess.getLocalAddr(0);
+
+
+	// Now check that the memory is also reachable via PCIe bus, otherwise GPU
+	// has no means to access it.
+	uintptr_t baseAddrOnPci;
+	size_t	sizeOnPci;
+	try {
+		auto translationPci = mm.getTranslation(pciAddrSpaceId,
+		                                        mappedBaseAddrSpaceId);
+		baseAddrOnPci = translationPci.getLocalAddr(0);
+		sizeOnPci = translationPci.getSize();
+	} catch(const std::out_of_range&) {
+		logger->error("Memory is not reachable via PCIe bus");
+		return false;
+	}
+
+	if(sizeOnPci < mem.getSize()) {
+		logger->warn("VA mapping of IO memory is too small: {:#x} instead of {:#x} bytes",
+		              sizeOnPci, mem.getSize());
+		logger->warn("If something later on fails or behaves strangely, this might be the cause!");
+	}
+
+
+	cudaSetDevice(gpuId);
+
+	auto baseAddrVA = reinterpret_cast<void*>(baseAddrForProcess);
+	if(cudaHostRegister(baseAddrVA, sizeOnPci, cudaHostRegisterIoMemory) != cudaSuccess) {
+		logger->error("Cannot register IO memory for block {}", mem.getAddrSpaceId());
+		return false;
+	}
+
+	void* devicePointer = nullptr;
+	if(cudaHostGetDevicePointer(&devicePointer, baseAddrVA, 0) != cudaSuccess) {
+		logger->error("Cannot retrieve device pointer for IO memory");
+		return false;
+	}
+
+	mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), baseAddrOnPci,
+	                 sizeOnPci, "CudaIoMem", masterPciEAddrSpaceId, pciAddrSpaceId);
+
+	return true;
+}
+
+bool
+Gpu::registerHostMemory(const MemoryBlock& mem)
+{
+	auto& mm = MemoryManager::get();
+
+	auto translation = mm.getTranslationFromProcess(mem.getAddrSpaceId());
+	auto localBase = reinterpret_cast<void*>(translation.getLocalAddr(0));
+
+	int ret = cudaHostRegister(localBase, mem.getSize(), 0);
+	if(ret != cudaSuccess) {
+		logger->error("Cannot register memory block {} addr={:p} size={:#x} to CUDA: ret={}",
+		              mem.getAddrSpaceId(), localBase, mem.getSize(), ret);
+		return false;
+	}
+
+	void* devicePointer = nullptr;
+	ret = cudaHostGetDevicePointer(&devicePointer, localBase, 0);
+	if(ret != cudaSuccess) {
+		logger->error("Cannot retrieve device pointer for IO memory: ret={}", ret);
+		return false;
+	}
+
+	mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), 0, mem.getSize(),
+	                 "CudaHostMem", masterPciEAddrSpaceId, mem.getAddrSpaceId());
+
+	return true;
+}
+
+bool Gpu::makeAccessibleToPCIeAndVA(const MemoryBlock& mem)
+{
+	if(pImpl->gdr == nullptr) {
+		logger->warn("GDRcopy not available");
+		return false;
+	}
+
+	auto& mm = MemoryManager::get();
+
+	try {
+		auto path = mm.findPath(masterPciEAddrSpaceId, mem.getAddrSpaceId());
+		// if first hop is the PCIe bus, we know that memory is off-GPU
+		if(path.front() == mm.getPciAddressSpace()) {
+			throw std::out_of_range("Memory block is outside of this GPU");
+		}
+
+	} catch (const std::out_of_range&) {
+		logger->error("Trying to map non-GPU memory block");
+		return false;
+	}
+
+	logger->debug("retrieve complete device pointer from point of view of GPU");
+	// retrieve complete device pointer from point of view of GPU
+	auto translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                     mem.getAddrSpaceId());
+	CUdeviceptr devptr = translation.getLocalAddr(0);
+
+	int ret;
+
+	// required to set this flag before mapping
+	unsigned int enable = 1;
+	ret = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, devptr);
+	if(ret != CUDA_SUCCESS) {
+		logger->error("Cannot set pointer attributes on memory block {}: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	gdr_mh_t mh;
+	ret = gdr_pin_buffer(pImpl->gdr, devptr, mem.getSize(), 0, 0, &mh);
+	if(ret != 0) {
+		logger->error("Cannot pin memory block {} via gdrcopy: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	void* bar = nullptr;
+	ret = gdr_map(pImpl->gdr, mh, &bar, mem.getSize());
+	if(ret != 0) {
+		logger->error("Cannot map memory block {} via gdrcopy: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	gdr_info_t info;
+	ret = gdr_get_info(pImpl->gdr, mh, &info);
+	if(ret != 0) {
+		logger->error("Cannot get info for mapping of memory block {}: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	const uintptr_t offset = info.va - devptr;
+	const uintptr_t userPtr = reinterpret_cast<uintptr_t>(bar) + offset;
+
+	logger->debug("BAR ptr:          {:p}",  bar);
+	logger->debug("info.va:          {:#x}", info.va);
+	logger->debug("info.mapped_size: {:#x}", info.mapped_size);
+	logger->debug("info.page_size:   {:#x}", info.page_size);
+	logger->debug("offset:           {:#x}", offset);
+	logger->debug("user pointer:     {:#x}", userPtr);
+
+	// mapping to acceses memory block from process
+	mm.createMapping(userPtr, 0, info.mapped_size, "GDRcopy",
+	                 mm.getProcessAddressSpace(), mem.getAddrSpaceId());
+
+	// retrieve bus address
+	uint64_t addr[8];
+	ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, 8);
+
+	for(int i = 0; i < ret; i++) {
+		logger->debug("DMA addr[{}]:      {:#x}", i, addr[i]);
+	}
+
+	if(ret != 1) {
+		logger->error("Only one DMA address per block supported at the moment");
+		return false;
+	}
+
+	// mapping to access memory block from peer devices via PCIe
+	mm.createMapping(addr[0], 0, mem.getSize(), "GDRcopyDMA",
+	                 mm.getPciAddressSpace(), mem.getAddrSpaceId());
+
+	return true;
+}
+
+bool
+Gpu::makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem)
+{
+	// Check which kind of memory this is and where it resides
+	// There are two possibilities:
+	//  - Host memory not managed by CUDA
+	//  - IO memory somewhere on the PCIe bus
+
+	auto& mm = MemoryManager::get();
+
+	bool isIoMemory = false;
+	try {
+		auto path = mm.findPath(mm.getPciAddressSpace(), mem.getAddrSpaceId());
+		isIoMemory = true;
+	} catch(const std::out_of_range&) {
+		// not reachable via PCI -> not IO memory
+	}
+
+	if(isIoMemory) {
+		logger->debug("Memory block {} is assumed to be IO memory",
+		              mem.getAddrSpaceId());
+
+		return registerIoMemory(mem);
+	} else {
+		logger->debug("Memory block {} is assumed to be non-CUDA host memory",
+		              mem.getAddrSpaceId());
+
+		return registerHostMemory(mem);
+	}
+}
+
+void Gpu::memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
+{
+	auto& mm = MemoryManager::get();
+
+	auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         src.getAddrSpaceId());
+	const void* src_buf = reinterpret_cast<void*>(src_translation.getLocalAddr(0));
+
+	auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         dst.getAddrSpaceId());
+	void* dst_buf = reinterpret_cast<void*>(dst_translation.getLocalAddr(0));
+
+	cudaSetDevice(gpuId);
+	cudaMemcpy(dst_buf, src_buf, size, cudaMemcpyDefault);
+}
+
+void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
+{
+	auto& mm = MemoryManager::get();
+
+	auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         src.getAddrSpaceId());
+	auto src_buf = reinterpret_cast<uint8_t*>(src_translation.getLocalAddr(0));
+
+	auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         dst.getAddrSpaceId());
+	auto dst_buf = reinterpret_cast<uint8_t*>(dst_translation.getLocalAddr(0));
+
+	cudaSetDevice(gpuId);
+	kernel_memcpy<<<1, 1>>>(dst_buf, src_buf, size);
+	cudaDeviceSynchronize();
+}
+
+
+std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
+GpuAllocator::allocateBlock(size_t size)
+{
+	cudaSetDevice(gpu.gpuId);
+
+	void* addr;
+	if(cudaSuccess != cudaMalloc(&addr, size)) {
+		logger->error("cudaMalloc(..., size={}) failed", size);
+		throw std::bad_alloc();
+	}
+
+	auto& mm = MemoryManager::get();
+
+	// assemble name for this block
+	std::stringstream name;
+	name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
+
+	auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
+	auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
+
+	const auto localAddr = reinterpret_cast<uintptr_t>(addr);
+	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
+	        mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
+
+	insertMemoryBlock(*mem);
+
+	gpu.makeAccessibleToPCIeAndVA(*mem);
+
+	return mem;
+}
+
+
+Gpu::Gpu(int gpuId) :
+    pImpl{std::make_unique<impl>()},
+    gpuId(gpuId)
+{
+	logger = loggerGetOrCreate(getName());
+
+	pImpl->gdr = gdr_open();
+	if(pImpl->gdr == nullptr) {
+		logger->warn("No GDRcopy support enabled, cannot open /dev/gdrdrv");
+	}
+}
+
+bool Gpu::init()
+{
+	auto& mm = MemoryManager::get();
+
+	const auto gpuPciEAddrSpaceName = mm.getMasterAddrSpaceName(getName(), "PCIe");
+	masterPciEAddrSpaceId = mm.getOrCreateAddressSpace(gpuPciEAddrSpaceName);
+
+	allocator = std::make_unique<GpuAllocator>(*this);
+
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, gpuId);
+
+	pImpl->pdev.slot = {
+	    deviceProp.pciDomainID,
+	    deviceProp.pciBusID,
+	    deviceProp.pciDeviceID,
+	    0};
+
+	struct pci_region* pci_regions = nullptr;
+	const size_t pci_num_regions = pci_get_regions(&pImpl->pdev, &pci_regions);
+	for(size_t i = 0; i < pci_num_regions; i++) {
+		const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
+		logger->info("BAR{}: bus addr={:#x} size={:#x}",
+		             pci_regions[i].num, pci_regions[i].start, region_size);
+
+		char name[] = "BARx";
+		name[3] = '0' + pci_regions[i].num;
+
+		auto gpuBarXAddrSpaceName = mm.getSlaveAddrSpaceName(getName(), name);
+		auto gpuBarXAddrSpaceId = mm.getOrCreateAddressSpace(gpuBarXAddrSpaceName);
+
+		mm.createMapping(pci_regions[i].start, 0, region_size,
+		                 std::string("PCI-") + name,
+		                 mm.getPciAddressSpace(), gpuBarXAddrSpaceId);
+	}
+
+	free(pci_regions);
+
+	return true;
+}
+
+
+std::list<std::unique_ptr<Gpu>>
+GpuFactory::make()
+{
+	int deviceCount = 0;
+	cudaGetDeviceCount(&deviceCount);
+
+	std::list<std::unique_ptr<Gpu>> gpuList;
+
+	for(int gpuId = 0; gpuId < deviceCount; gpuId++) {
+		if(cudaSetDevice(gpuId) != cudaSuccess) {
+			logger->warn("Cannot activate GPU {}", gpuId);
+			continue;
+		}
+
+		auto gpu = std::make_unique<Gpu>(gpuId);
+
+		if(not gpu->init()) {
+			logger->warn("Cannot initialize GPU {}", gpuId);
+			continue;
+		}
+
+		gpuList.emplace_back(std::move(gpu));
+	}
+
+	logger->info("Initialized {} GPUs", gpuList.size());
+	for(auto& gpu : gpuList) {
+		logger->debug(" - {}", gpu->getName());
+	}
+
+	return gpuList;
+}
+
 } // namespace villas
 } // namespace gpu
diff --git a/fpga/lib/gpu/include/villas/gpu.hpp b/fpga/lib/gpu/include/villas/gpu.hpp
index 903dd08c2..88b316815 100644
--- a/fpga/lib/gpu/include/villas/gpu.hpp
+++ b/fpga/lib/gpu/include/villas/gpu.hpp
@@ -1,7 +1,87 @@
 #pragma once
 
+#include <sstream>
+
+#include <plugin.hpp>
+#include <memory_manager.hpp>
+#include <memory.hpp>
+#include <villas/log.hpp>
+
+
 namespace villas {
 namespace gpu {
 
+class GpuAllocator;
+
+class Gpu {
+	friend GpuAllocator;
+public:
+	Gpu(int gpuId);
+	~Gpu();
+
+	bool init();
+
+	std::string getName() const;
+
+	GpuAllocator& getAllocator() const
+	{ return *allocator; }
+
+
+	bool makeAccessibleToPCIeAndVA(const MemoryBlock& mem);
+
+	/// Make some memory block accssible for this GPU
+	bool makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem);
+
+	void memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
+
+	void memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
+
+private:
+	bool registerIoMemory(const MemoryBlock& mem);
+	bool registerHostMemory(const MemoryBlock& mem);
+
+private:
+	class impl;
+	std::unique_ptr<impl> pImpl;
+
+	// master, will be used to derived slave addr spaces for allocation
+	MemoryManager::AddressSpaceId masterPciEAddrSpaceId;
+
+	MemoryManager::AddressSpaceId slaveMemoryAddrSpaceId;
+
+	SpdLogger logger;
+
+	int gpuId;
+
+	std::unique_ptr<GpuAllocator> allocator;
+};
+
+
+class GpuAllocator : public BaseAllocator<GpuAllocator> {
+public:
+	GpuAllocator(Gpu& gpu);
+
+	std::string getName() const;
+
+	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
+	allocateBlock(size_t size);
+
+private:
+	Gpu& gpu;
+};
+
+class GpuFactory : public Plugin {
+public:
+	GpuFactory();
+
+	std::list<std::unique_ptr<Gpu>>
+	make();
+
+	void run(void*);
+
+private:
+	SpdLogger logger;
+};
+
 } // namespace villas
 } // namespace gpu
diff --git a/fpga/lib/gpu/kernels.cu b/fpga/lib/gpu/kernels.cu
new file mode 100644
index 000000000..7e8b5524d
--- /dev/null
+++ b/fpga/lib/gpu/kernels.cu
@@ -0,0 +1,42 @@
+#include <stdio.h>
+
+#include <villas/gpu.hpp>
+#include "kernels.hpp"
+
+#include "cuda_runtime.h"
+#include <cuda.h>
+
+namespace villas {
+namespace gpu {
+
+
+__global__ void
+kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter)
+{
+	printf("[gpu] hello!\n");
+	printf("[gpu] mailbox: %p\n", mailbox);
+
+	printf("[kernel] started\n");
+
+	while(1) {
+		if (*mailbox == 1) {
+			*mailbox = 0;
+			printf("[gpu] counter = %d\n", *counter);
+			break;
+		}
+	}
+
+	printf("[gpu] quit\n");
+}
+
+__global__ void
+kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length)
+{
+	while(length > 0) {
+		*dst++ = *src++;
+		length--;
+	}
+}
+
+} // namespace villas
+} // namespace gpu
diff --git a/fpga/lib/gpu/kernels.hpp b/fpga/lib/gpu/kernels.hpp
new file mode 100644
index 000000000..986eba31e
--- /dev/null
+++ b/fpga/lib/gpu/kernels.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstdint>
+#include <cuda_runtime.h>
+
+namespace villas {
+namespace gpu {
+
+__global__ void
+kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter);
+
+__global__ void
+kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length);
+
+} // namespace villas
+} // namespace gpu
diff --git a/fpga/tests/CMakeLists.txt b/fpga/tests/CMakeLists.txt
index 2e4cfe041..f36647ab3 100644
--- a/fpga/tests/CMakeLists.txt
+++ b/fpga/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SOURCES
 	logging.cpp
 	dma.cpp
     fifo.cpp
+	gpu.cpp
 #	hls.c
 #	intc.c
 #	rtds_rtt.c
diff --git a/fpga/tests/gpu.cpp b/fpga/tests/gpu.cpp
new file mode 100644
index 000000000..bdc3aba92
--- /dev/null
+++ b/fpga/tests/gpu.cpp
@@ -0,0 +1,129 @@
+#include <criterion/criterion.h>
+
+#include <map>
+#include <string>
+
+#include <villas/log.hpp>
+#include <villas/fpga/card.hpp>
+#include <villas/fpga/ips/dma.hpp>
+#include <villas/fpga/ips/bram.hpp>
+
+#include <villas/utils.h>
+
+#include "global.hpp"
+
+#include <villas/memory.hpp>
+#include <villas/gpu.hpp>
+
+
+Test(fpga, gpu_dma, .description = "GPU DMA tests")
+{
+	auto logger = loggerGetOrCreate("unittest:dma");
+
+	auto& card = state.cards.front();
+
+	villas::Plugin* plugin = villas::Plugin::lookup(villas::Plugin::Type::Gpu, "");
+	auto gpuPlugin = dynamic_cast<villas::gpu::GpuFactory*>(plugin);
+	cr_assert_not_null(gpuPlugin, "No GPU plugin found");
+
+	auto gpus = gpuPlugin->make();
+	cr_assert(gpus.size() > 0, "No GPUs found");
+
+	// just get first cpu
+	auto& gpu = gpus.front();
+
+	size_t count = 0;
+	for(auto& ip : card->ips) {
+		// skip non-dma IPs
+		if(*ip != villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"))
+			continue;
+
+		logger->info("Testing {}", *ip);
+
+		auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(ip.get());
+		cr_assert_not_null(bram, "Couldn't find BRAM");
+
+		count++;
+
+		size_t len = 4 * (1 << 10);
+
+		/* Allocate memory to use with DMA */
+
+		auto bram0 = bram->getAllocator().allocate<char>(len);
+		auto bram1 = bram->getAllocator().allocate<char>(len);
+
+		gpu->makeAccessibleFromPCIeOrHostRam(bram0.getMemoryBlock());
+		gpu->makeAccessibleFromPCIeOrHostRam(bram1.getMemoryBlock());
+
+		auto hostRam0 = villas::HostRam::getAllocator().allocate<char>(len);
+		auto hostRam1 = villas::HostRam::getAllocator().allocate<char>(len);
+
+		gpu->makeAccessibleFromPCIeOrHostRam(hostRam0.getMemoryBlock());
+		gpu->makeAccessibleFromPCIeOrHostRam(hostRam1.getMemoryBlock());
+
+		auto dmaRam0 = villas::HostDmaRam::getAllocator().allocate<char>(len);
+		auto dmaRam1 = villas::HostDmaRam::getAllocator().allocate<char>(len);
+
+		gpu->makeAccessibleFromPCIeOrHostRam(dmaRam0.getMemoryBlock());
+		gpu->makeAccessibleFromPCIeOrHostRam(dmaRam1.getMemoryBlock());
+
+		auto gpuMem0 = gpu->getAllocator().allocate<char>(64 << 10);
+		auto gpuMem1 = gpu->getAllocator().allocate<char>(64 << 10);
+
+		gpu->makeAccessibleToPCIeAndVA(gpuMem0.getMemoryBlock());
+		gpu->makeAccessibleToPCIeAndVA(gpuMem1.getMemoryBlock());
+
+
+//		auto& src = bram0;
+//		auto& dst = bram1;
+
+//		auto& src = hostRam0;
+//		auto& dst = hostRam1;
+
+		auto& src = dmaRam0;
+//		auto& dst = dmaRam1;
+
+//		auto& src = gpuMem0;
+		auto& dst = gpuMem1;
+
+
+		std::list<std::pair<std::string, std::function<void()>>> memcpyFuncs = {
+		    {"cudaMemcpy", [&]() {gpu->memcpySync(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
+		    {"CUDA kernel", [&]() {gpu->memcpyKernel(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
+	    };
+
+		auto dmaIp = card->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_dma:"));
+		auto dma = dynamic_cast<villas::fpga::ip::Dma*>(dmaIp);
+
+		if(dma != nullptr and dma->connectLoopback()) {
+			memcpyFuncs.push_back({
+			    "DMA memcpy", [&]() {
+			        if(not dma->makeAccesibleFromVA(src.getMemoryBlock()) or
+			           not dma->makeAccesibleFromVA(dst.getMemoryBlock())) {
+			            return;
+			        }
+			        dma->memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len);
+			    }});
+		}
+
+		for(auto& [name, memcpyFunc] : memcpyFuncs) {
+			logger->info("Testing {}", name);
+
+			/* Get new random data */
+			const size_t lenRandom = read_random(&src, len);
+			cr_assert(len == lenRandom, "Failed to get random data");
+
+			memcpyFunc();
+			const bool success = memcmp(&src, &dst, len) == 0;
+
+			logger->info("  {}", success ?
+			                 TXT_GREEN("Passed") :
+			                 TXT_RED("Failed"));
+		}
+
+		villas::MemoryManager::get().dump();
+	}
+
+
+	cr_assert(count > 0, "No BRAM found");
+}