From 94ba899b21efd06ba15141881a5ea7cbc6229da3 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 11:22:29 +0200
Subject: [PATCH 01/31] vfio: track if IOMMU is enabled to prepare for
 non-IOMMU mode

---
 fpga/include/villas/kernel/vfio.hpp |  4 ++++
 fpga/lib/kernel/vfio.cpp            | 37 ++++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/fpga/include/villas/kernel/vfio.hpp b/fpga/include/villas/kernel/vfio.hpp
index 9f794baf5..0e3fc453f 100644
--- a/fpga/include/villas/kernel/vfio.hpp
+++ b/fpga/include/villas/kernel/vfio.hpp
@@ -135,6 +135,9 @@ public:
 	/** munmap() a region which has been mapped by vfio_map_region() */
 	bool memoryUnmap(uintptr_t phys, size_t length);
 
+	bool isIommuEnabled() const
+	{ return this->hasIommu; }
+
 private:
 	VfioGroup& getOrAttachGroup(int index);
 
@@ -143,6 +146,7 @@ private:
 	int version;
 	int extensions;
 	uint64_t iova_next;			/**< Next free IOVA address */
+	bool hasIommu;
 
 	/// All groups bound to this container
 	std::list<std::unique_ptr<VfioGroup>> groups;
diff --git a/fpga/lib/kernel/vfio.cpp b/fpga/lib/kernel/vfio.cpp
index dcc29c2e8..7d29c9ee8 100644
--- a/fpga/lib/kernel/vfio.cpp
+++ b/fpga/lib/kernel/vfio.cpp
@@ -108,12 +108,25 @@ VfioContainer::VfioContainer()
 			logger->error("Failed to get VFIO extensions");
 			throw std::exception();
 		}
-		else if (ret > 0)
+		else if (ret > 0) {
 			extensions |= (1 << i);
+		}
 	}
 
-	logger->debug("Version: {:#x}", version);
+	hasIommu = false;
+
+	if(not (extensions & (1 << VFIO_NOIOMMU_IOMMU))) {
+		if(not (extensions & (1 << VFIO_TYPE1_IOMMU))) {
+			logger->error("No supported IOMMU extension found");
+			throw std::exception();
+		} else {
+			hasIommu = true;
+		}
+	}
+
+	logger->debug("Version:    {:#x}", version);
 	logger->debug("Extensions: {:#x}", extensions);
+	logger->debug("IOMMU:      {}", hasIommu ? "yes" : "no");
 }
 
 
@@ -319,6 +332,11 @@ VfioContainer::memoryMap(uintptr_t virt, uintptr_t phys, size_t length)
 {
 	int ret;
 
+	if(not hasIommu) {
+		logger->error("DMA mapping not supported without IOMMU");
+		return UINTPTR_MAX;
+	}
+
 	if (length & 0xFFF) {
 		length += 0x1000;
 		length &= ~0xFFF;
@@ -363,6 +381,10 @@ VfioContainer::memoryUnmap(uintptr_t phys, size_t length)
 {
 	int ret;
 
+	if(not hasIommu) {
+		return true;
+	}
+
 	struct vfio_iommu_type1_dma_unmap dmaUnmap;
 	dmaUnmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
 	dmaUnmap.flags = 0;
@@ -554,6 +576,11 @@ VfioDevice::pciHotReset()
 	free(reset);
 	free(reset_info);
 
+	if(not success and not group.container->isIommuEnabled()) {
+		logger->info("PCI hot reset failed, but this is expected without IOMMU");
+		return true;
+	}
+
 	return success;
 }
 
@@ -740,7 +767,11 @@ VfioGroup::attach(int containerFd, int groupIndex)
 	}
 
 	/* Set IOMMU type */
-	ret = ioctl(containerFd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
+	int iommu_type = container.isIommuEnabled() ?
+	                     VFIO_TYPE1_IOMMU :
+	                     VFIO_NOIOMMU_IOMMU;
+
+	ret = ioctl(containerFd, VFIO_SET_IOMMU, iommu_type);
 	if (ret < 0) {
 		logger->error("Failed to set IOMMU type of container: {}", ret);
 		return nullptr;

From b6ff452e53c639de9b6aea520c85074a829a89ab Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 11:23:48 +0200
Subject: [PATCH 02/31] vfio: minor refactoring

---
 fpga/include/villas/kernel/vfio.hpp |  8 ++++++--
 fpga/lib/kernel/vfio.cpp            | 26 +++++++++++++++-----------
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/fpga/include/villas/kernel/vfio.hpp b/fpga/include/villas/kernel/vfio.hpp
index 0e3fc453f..1a09e2e16 100644
--- a/fpga/include/villas/kernel/vfio.hpp
+++ b/fpga/include/villas/kernel/vfio.hpp
@@ -18,7 +18,8 @@
 #include <linux/vfio.h>
 #include <sys/mman.h>
 
-#define VFIO_DEV(x)	"/dev/vfio/" x
+#define VFIO_PATH	"/dev/vfio/"
+#define VFIO_DEV	VFIO_PATH "vfio"
 
 /* Forward declarations */
 struct pci_device;
@@ -90,7 +91,7 @@ public:
 	~VfioGroup();
 
 	static std::unique_ptr<VfioGroup>
-	attach(int containerFd, int groupIndex);
+	attach(const VfioContainer& container, int groupIndex);
 
 private:
 	/// VFIO group file descriptor
@@ -138,6 +139,9 @@ public:
 	bool isIommuEnabled() const
 	{ return this->hasIommu; }
 
+	const int& getFd() const
+	{ return fd; }
+
 private:
 	VfioGroup& getOrAttachGroup(int index);
 
diff --git a/fpga/lib/kernel/vfio.cpp b/fpga/lib/kernel/vfio.cpp
index 7d29c9ee8..67cc6d19e 100644
--- a/fpga/lib/kernel/vfio.cpp
+++ b/fpga/lib/kernel/vfio.cpp
@@ -87,7 +87,7 @@ VfioContainer::VfioContainer()
 	}
 
 	/* Open VFIO API */
-	fd = open(VFIO_DEV("vfio"), O_RDWR);
+	fd = open(VFIO_DEV, O_RDWR);
 	if (fd < 0) {
 		logger->error("Failed to open VFIO container");
 		throw std::exception();
@@ -412,7 +412,7 @@ VfioContainer::getOrAttachGroup(int index)
 	}
 
 	// group not yet part of this container, so acquire ownership
-	auto group = VfioGroup::attach(fd, index);
+	auto group = VfioGroup::attach(*this, index);
 	if(not group) {
 		logger->error("Failed to attach to IOMMU group: {}", index);
 		throw std::exception();
@@ -538,6 +538,7 @@ VfioDevice::pciHotReset()
 
 	const size_t reset_infolen = sizeof(struct vfio_pci_hot_reset_info) +
 	                             sizeof(struct vfio_pci_dependent_device) * 64;
+
 	auto reset_info = reinterpret_cast<struct vfio_pci_hot_reset_info*>
 	                    (calloc(1, reset_infolen));
 
@@ -562,6 +563,8 @@ VfioDevice::pciHotReset()
 		}
 	}
 
+	free(reset_info);
+
 	const size_t resetlen = sizeof(struct vfio_pci_hot_reset) +
 	                        sizeof(int32_t) * 1;
 	auto reset = reinterpret_cast<struct vfio_pci_hot_reset*>
@@ -571,10 +574,10 @@ VfioDevice::pciHotReset()
 	reset->count = 1;
 	reset->group_fds[0] = this->group.fd;
 
-	const bool success = ioctl(this->fd, VFIO_DEVICE_PCI_HOT_RESET, reset) == 0;
+	int ret = ioctl(this->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+	const bool success = (ret == 0);
 
 	free(reset);
-	free(reset_info);
 
 	if(not success and not group.container->isIommuEnabled()) {
 		logger->info("PCI hot reset failed, but this is expected without IOMMU");
@@ -740,13 +743,16 @@ VfioGroup::~VfioGroup()
 
 
 std::unique_ptr<VfioGroup>
-VfioGroup::attach(int containerFd, int groupIndex)
+VfioGroup::attach(const VfioContainer& container, int groupIndex)
 {
 	std::unique_ptr<VfioGroup> group { new VfioGroup(groupIndex) };
 
 	/* Open group fd */
 	std::stringstream groupPath;
-	groupPath << VFIO_DEV("") << groupIndex;
+	groupPath << VFIO_PATH
+	          << (container.isIommuEnabled() ? "" : "noiommu-")
+	          << groupIndex;
+
 	group->fd = open(groupPath.str().c_str(), O_RDWR);
 	if (group->fd < 0) {
 		logger->error("Failed to open VFIO group {}", group->index);
@@ -756,13 +762,11 @@ VfioGroup::attach(int containerFd, int groupIndex)
 	logger->debug("VFIO group {} (fd {}) has path {}",
 	              groupIndex, group->fd, groupPath.str());
 
-	int ret;
-
 	/* Claim group ownership */
-	ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &containerFd);
+	int ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container.getFd());
 	if (ret < 0) {
 		logger->error("Failed to attach VFIO group {} to container fd {} (error {})",
-		              group->index, containerFd, ret);
+		              group->index, container.getFd(), ret);
 		return nullptr;
 	}
 
@@ -771,7 +775,7 @@ VfioGroup::attach(int containerFd, int groupIndex)
 	                     VFIO_TYPE1_IOMMU :
 	                     VFIO_NOIOMMU_IOMMU;
 
-	ret = ioctl(containerFd, VFIO_SET_IOMMU, iommu_type);
+	ret = ioctl(container.getFd(), VFIO_SET_IOMMU, iommu_type);
 	if (ret < 0) {
 		logger->error("Failed to set IOMMU type of container: {}", ret);
 		return nullptr;

From 80386d1085f3110cd6e3fbf80337887bf121c564 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 24 Apr 2018 13:11:25 +0200
Subject: [PATCH 03/31] vfio: correctly set container on group

---
 fpga/include/villas/kernel/vfio.hpp | 2 +-
 fpga/lib/kernel/vfio.cpp            | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fpga/include/villas/kernel/vfio.hpp b/fpga/include/villas/kernel/vfio.hpp
index 1a09e2e16..473873d04 100644
--- a/fpga/include/villas/kernel/vfio.hpp
+++ b/fpga/include/villas/kernel/vfio.hpp
@@ -91,7 +91,7 @@ public:
 	~VfioGroup();
 
 	static std::unique_ptr<VfioGroup>
-	attach(const VfioContainer& container, int groupIndex);
+	attach(VfioContainer& container, int groupIndex);
 
 private:
 	/// VFIO group file descriptor
diff --git a/fpga/lib/kernel/vfio.cpp b/fpga/lib/kernel/vfio.cpp
index 67cc6d19e..2bf90b590 100644
--- a/fpga/lib/kernel/vfio.cpp
+++ b/fpga/lib/kernel/vfio.cpp
@@ -743,10 +743,12 @@ VfioGroup::~VfioGroup()
 
 
 std::unique_ptr<VfioGroup>
-VfioGroup::attach(const VfioContainer& container, int groupIndex)
+VfioGroup::attach(VfioContainer& container, int groupIndex)
 {
 	std::unique_ptr<VfioGroup> group { new VfioGroup(groupIndex) };
 
+	group->container = &container;
+
 	/* Open group fd */
 	std::stringstream groupPath;
 	groupPath << VFIO_PATH

From c3993a22c6acfa0d245259f02aaa4080a8db5903 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 24 Apr 2018 13:12:02 +0200
Subject: [PATCH 04/31] vfio: IOMMU group is always 0 if no IOMMU is present

---
 fpga/lib/kernel/vfio.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fpga/lib/kernel/vfio.cpp b/fpga/lib/kernel/vfio.cpp
index 2bf90b590..4451536ba 100644
--- a/fpga/lib/kernel/vfio.cpp
+++ b/fpga/lib/kernel/vfio.cpp
@@ -302,7 +302,7 @@ VfioContainer::attachDevice(const pci_device* pdev)
 	}
 
 	/* Get IOMMU group of device */
-	int index = pci_get_iommu_group(pdev);
+	int index = isIommuEnabled() ? pci_get_iommu_group(pdev) : 0;
 	if (index < 0) {
 		logger->error("Failed to get IOMMU group of device");
 		throw std::exception();

From 5b8f573337f0a3174ec4f16f55b7824cd87005c8 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 24 Apr 2018 13:12:32 +0200
Subject: [PATCH 05/31] json: parse 64bit numbers, this is required for numbers
 > 2^31

Our current JSON library jansson only parses signed integers, so
it cannot correctly parse numbers between 2^31 and 2^32 into a
32 bit type.
---
 fpga/lib/ip.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fpga/lib/ip.cpp b/fpga/lib/ip.cpp
index 63cffdebc..0a21d16e3 100644
--- a/fpga/lib/ip.cpp
+++ b/fpga/lib/ip.cpp
@@ -223,8 +223,8 @@ IpCoreFactory::make(PCIeCard* card, json_t *json_ips)
 					json_t* json_block;
 					json_object_foreach(json_instance, block_name, json_block) {
 
-						unsigned int base, high, size;
-						int ret = json_unpack(json_block, "{ s: i, s: i, s: i }",
+						json_int_t base, high, size;
+						int ret = json_unpack(json_block, "{ s: I, s: I, s: I }",
 						                      "baseaddr", &base,
 						                      "highaddr", &high,
 						                      "size", &size);

From 01803abadec116cf8fcd13441f07b4bb3657a8f4 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 15:08:30 +0200
Subject: [PATCH 06/31] hwdef-parse: parse PCI and AXI BARs

---
 fpga/scripts/hwdef-parse.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/fpga/scripts/hwdef-parse.py b/fpga/scripts/hwdef-parse.py
index 12d96e4f2..cb617f703 100755
--- a/fpga/scripts/hwdef-parse.py
+++ b/fpga/scripts/hwdef-parse.py
@@ -255,4 +255,27 @@ for bram in brams:
 	if instance in ips:
 		ips[instance]['size'] = int(size)
 
+pcies = root.xpath('.//MODULE[@MODTYPE="axi_pcie"]')
+for pcie in pcies:
+	instance = pcie.get('INSTANCE')
+	axi_bars = ips[instance].setdefault('axi_bars', {})
+	pcie_bars = ips[instance].setdefault('pcie_bars', {})
+
+	for from_bar, to_bar, from_bars in (('AXIBAR', 'PCIEBAR', axi_bars), ('PCIEBAR', 'AXIBAR', pcie_bars)):
+		from_bar_num = int(pcie.find('.//PARAMETER[@NAME="C_{}_NUM"]'.format(from_bar)).get('VALUE'))
+
+		for i in range(0, from_bar_num):
+			from_bar_to_bar_offset = int(pcie.find('.//PARAMETER[@NAME="C_{}2{}_{}"]'.format(from_bar, to_bar, i)).get('VALUE'), 16)
+			from_bars['BAR{}'.format(i)] = { 'translation': from_bar_to_bar_offset }
+
+			if from_bar == 'AXIBAR':
+				axi_bar_lo = int(pcie.find('.//PARAMETER[@NAME="C_{}_{}"]'.format(from_bar, i)).get('VALUE'), 16)
+				axi_bar_hi = int(pcie.find('.//PARAMETER[@NAME="C_{}_HIGHADDR_{}"]'.format(from_bar, i)).get('VALUE'), 16)
+				axi_bar_size = axi_bar_hi - axi_bar_lo + 1
+
+				axi_bar = from_bars['BAR{}'.format(i)]
+				axi_bar['baseaddr'] = axi_bar_lo
+				axi_bar['highaddr'] = axi_bar_hi
+				axi_bar['size'] = axi_bar_size
+
 print(json.dumps(ips, indent=2))

From 68e5481d97d6dc24e41f974bd47390b31614a84f Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 24 Apr 2018 13:14:41 +0200
Subject: [PATCH 07/31] config: new config for changed bitstream

AXI-BAR0 on PCIe bridge now allows access to whole PCI address space.
---
 fpga/etc/fpga.json | 52 ++++++++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/fpga/etc/fpga.json b/fpga/etc/fpga.json
index fe29ef418..5f982e2a8 100644
--- a/fpga/etc/fpga.json
+++ b/fpga/etc/fpga.json
@@ -120,30 +120,30 @@
             "M_AXI_MM2S": {
               "pcie_0_axi_pcie_0": {
                 "BAR0": {
-                  "baseaddr": 2147483648,
+                  "baseaddr": 0,
                   "highaddr": 4294967295,
-                  "size": 2147483648
+                  "size": 4294967296
                 }
               }
             },
             "M_AXI_S2MM": {
               "pcie_0_axi_pcie_0": {
                 "BAR0": {
-                  "baseaddr": 2147483648,
+                  "baseaddr": 0,
                   "highaddr": 4294967295,
-                  "size": 2147483648
+                  "size": 4294967296
                 }
               }
             }
           },
           "ports": [
             {
-              "role": "initiator",
+              "role": "master",
               "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
               "name": "MM2S"
             },
             {
-              "role": "target",
+              "role": "slave",
               "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
               "name": "S2MM"
             }
@@ -159,30 +159,30 @@
             "M_AXI_MM2S": {
               "pcie_0_axi_pcie_0": {
                 "BAR0": {
-                  "baseaddr": 2147483648,
+                  "baseaddr": 0,
                   "highaddr": 4294967295,
-                  "size": 2147483648
+                  "size": 4294967296
                 }
               }
             },
             "M_AXI_S2MM": {
               "pcie_0_axi_pcie_0": {
                 "BAR0": {
-                  "baseaddr": 2147483648,
+                  "baseaddr": 0,
                   "highaddr": 4294967295,
-                  "size": 2147483648
+                  "size": 4294967296
                 }
               }
             }
           },
           "ports": [
             {
-              "role": "initiator",
+              "role": "master",
               "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
               "name": "MM2S"
             },
             {
-              "role": "target",
+              "role": "slave",
               "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
               "name": "S2MM"
             }
@@ -214,22 +214,22 @@
           "vlnv": "xilinx.com:ip:axis_switch:1.1",
           "ports": [
             {
-              "role": "initiator",
+              "role": "master",
               "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
               "name": "M03_AXIS"
             },
             {
-              "role": "target",
+              "role": "slave",
               "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
               "name": "S03_AXIS"
             },
             {
-              "role": "initiator",
+              "role": "master",
               "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
               "name": "M04_AXIS"
             },
             {
-              "role": "target",
+              "role": "slave",
               "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
               "name": "S04_AXIS"
             }
@@ -237,7 +237,7 @@
           "num_ports": 7
         },
         "hier_0_hls_dft_0": {
-          "vlnv": "acs.eonerc.rwth-aachen.de:hls:hls_dft:1.0",
+          "vlnv": "acs.eonerc.rwth-aachen.de:hls:hls_dft:1.1",
           "ports": [
             {
               "role": "master",
@@ -249,7 +249,10 @@
               "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:5",
               "name": "input_r"
             }
-          ]
+          ],
+          "irqs": {
+            "interrupt": "pcie_0_axi_pcie_intc_0:1"
+          }
         },
         "hier_0_rtds_axis_0": {
           "vlnv": "acs.eonerc.rwth-aachen.de:user:rtds_axis:1.0",
@@ -358,6 +361,19 @@
                 }
               }
             }
+          },
+          "axi_bars": {
+            "BAR0": {
+              "translation": 0,
+              "baseaddr": 0,
+              "highaddr": 4294967295,
+              "size": 4294967296
+            }
+          },
+          "pcie_bars": {
+            "BAR0": {
+              "translation": 0
+            }
           }
         },
         "pcie_0_axi_pcie_intc_0": {

From 9490594167439937cc669c7215ee4f07d67cc7c7 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 12:00:31 +0200
Subject: [PATCH 08/31] allocator: properly remove memory block from memory
 graph

---
 fpga/include/villas/memory.hpp | 2 ++
 fpga/lib/common/memory.cpp     | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/fpga/include/villas/memory.hpp b/fpga/include/villas/memory.hpp
index 8a1895b23..12779bf01 100644
--- a/fpga/include/villas/memory.hpp
+++ b/fpga/include/villas/memory.hpp
@@ -116,6 +116,8 @@ public:
 		free = [&](MemoryBlock* mem) {
 			logger->warn("no free callback defined for addr space {}, not freeing",
 			             mem->getAddrSpaceId());
+
+			removeMemoryBlock(*mem);
 		};
 	}
 
diff --git a/fpga/lib/common/memory.cpp b/fpga/lib/common/memory.cpp
index f2def1432..c6d5b386e 100644
--- a/fpga/lib/common/memory.cpp
+++ b/fpga/lib/common/memory.cpp
@@ -62,6 +62,8 @@ LinearAllocator::LinearAllocator(MemoryManager::AddressSpaceId memoryAddrSpaceId
 		              mem->getSize(), mem->getOffset(), mem->getAddrSpaceId());
 		logger->warn("free() not implemented");
 		logger->debug("available memory: {:#x} bytes", getAvailableMemory());
+
+		removeMemoryBlock(*mem);
 	};
 }
 
@@ -133,6 +135,8 @@ HostRam::HostRamAllocator::HostRamAllocator() :
 			logger->warn("munmap() failed for {:#x} of size {:#x}",
 			             mem->getOffset(), mem->getSize());
 		}
+
+		removeMemoryBlock(*mem);
 	};
 }
 

From d81fc6fe11a61858602687272faa7010cca70a4b Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Fri, 13 Apr 2018 15:59:34 +0200
Subject: [PATCH 09/31] gpu: add empty library for GPU-related stuff

---
 fpga/lib/CMakeLists.txt             |  2 ++
 fpga/lib/gpu/CMakeLists.txt         | 14 ++++++++++++++
 fpga/lib/gpu/gpu.cpp                |  7 +++++++
 fpga/lib/gpu/include/villas/gpu.hpp |  7 +++++++
 4 files changed, 30 insertions(+)
 create mode 100644 fpga/lib/gpu/CMakeLists.txt
 create mode 100644 fpga/lib/gpu/gpu.cpp
 create mode 100644 fpga/lib/gpu/include/villas/gpu.hpp

diff --git a/fpga/lib/CMakeLists.txt b/fpga/lib/CMakeLists.txt
index 834a7c856..fff01b8e1 100644
--- a/fpga/lib/CMakeLists.txt
+++ b/fpga/lib/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(common)
+add_subdirectory(gpu)
 
 set(SOURCES
 	vlnv.cpp
@@ -35,6 +36,7 @@ find_package(Threads)
 add_library(villas-fpga SHARED ${SOURCES})
 
 target_link_libraries(villas-fpga PUBLIC villas-common)
+target_link_libraries(villas-fpga PUBLIC villas-gpu)
 
 target_compile_definitions(villas-fpga PRIVATE
 	BUILDID=\"abc\"
diff --git a/fpga/lib/gpu/CMakeLists.txt b/fpga/lib/gpu/CMakeLists.txt
new file mode 100644
index 000000000..68c3993ea
--- /dev/null
+++ b/fpga/lib/gpu/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.7)
+
+project(villas-gpu
+	VERSION 1.0
+	DESCRIPTION "VILLASgpu"
+	LANGUAGES C CXX)
+
+# fail if CUDA not found
+find_package(CUDA QUIET REQUIRED)
+
+cuda_add_library(villas-gpu SHARED gpu.cpp)
+
+target_include_directories(villas-gpu
+	PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include)
diff --git a/fpga/lib/gpu/gpu.cpp b/fpga/lib/gpu/gpu.cpp
new file mode 100644
index 000000000..4314fcb65
--- /dev/null
+++ b/fpga/lib/gpu/gpu.cpp
@@ -0,0 +1,7 @@
+#include <villas/gpu.hpp>
+
+namespace villas {
+namespace gpu {
+
+} // namespace villas
+} // namespace gpu
diff --git a/fpga/lib/gpu/include/villas/gpu.hpp b/fpga/lib/gpu/include/villas/gpu.hpp
new file mode 100644
index 000000000..903dd08c2
--- /dev/null
+++ b/fpga/lib/gpu/include/villas/gpu.hpp
@@ -0,0 +1,7 @@
+#pragma once
+
+namespace villas {
+namespace gpu {
+
+} // namespace villas
+} // namespace gpu

From 3f71793327446bf30f98af49a5b87adfb70f472c Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Wed, 18 Apr 2018 21:56:24 +0200
Subject: [PATCH 10/31] gpu: add gdrcopy submodule

---
 fpga/.gitmodules     | 3 +++
 fpga/lib/gpu/gdrcopy | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 fpga/lib/gpu/gdrcopy

diff --git a/fpga/.gitmodules b/fpga/.gitmodules
index e67658aa1..8bbca3dab 100644
--- a/fpga/.gitmodules
+++ b/fpga/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "thirdparty/libxil"]
 	path = thirdparty/libxil
 	url = https://git.rwth-aachen.de/acs/public/villas/libxil.git
+[submodule "lib/gpu/gdrcopy"]
+	path = lib/gpu/gdrcopy
+	url = https://github.com/daniel-k/gdrcopy.git
diff --git a/fpga/lib/gpu/gdrcopy b/fpga/lib/gpu/gdrcopy
new file mode 160000
index 000000000..2b933176d
--- /dev/null
+++ b/fpga/lib/gpu/gdrcopy
@@ -0,0 +1 @@
+Subproject commit 2b933176d0fd20f10bddfdf574a1d3229ca1ecdf

From 29709aed7abd566dbcfa61571f282a47c6c689fa Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 11:45:32 +0200
Subject: [PATCH 11/31] directed-graph: make compile with C++11 (no C++17 with
 CUDA)

---
 fpga/include/villas/directed_graph.hpp | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/fpga/include/villas/directed_graph.hpp b/fpga/include/villas/directed_graph.hpp
index 6335d8087..b75912335 100644
--- a/fpga/include/villas/directed_graph.hpp
+++ b/fpga/include/villas/directed_graph.hpp
@@ -81,7 +81,7 @@ public:
 
 	std::shared_ptr<VertexType> getVertex(VertexIdentifier vertexId) const
 	{
-		if(vertexId < 0 or vertexId >= lastVertexId)
+		if(vertexId >= lastVertexId)
 			throw std::invalid_argument("vertex doesn't exist");
 
 		// cannot use [] operator, because creates non-existing elements
@@ -92,7 +92,10 @@ public:
 	template<class UnaryPredicate>
 	VertexIdentifier findVertex(UnaryPredicate p)
 	{
-		for(auto& [vertexId, vertex] : vertices) {
+		for(auto& v : vertices) {
+			auto& vertexId = v.first;
+			auto& vertex = v.second;
+
 			if(p(vertex)) {
 				return vertexId;
 			}
@@ -103,7 +106,7 @@ public:
 
 	std::shared_ptr<EdgeType> getEdge(EdgeIdentifier edgeId) const
 	{
-		if(edgeId < 0 or edgeId >= lastEdgeId)
+		if(edgeId >= lastEdgeId)
 			throw std::invalid_argument("edge doesn't exist");
 
 		// cannot use [] operator, because creates non-existing elements
@@ -177,7 +180,9 @@ public:
 		// delete every edge that start or ends at this vertex
 		auto it = edges.begin();
 		while(it != edges.end()) {
-			auto& [edgeId, edge] = *it;
+			auto& edgeId = it->first;
+			auto& edge = it->second;
+
 			bool removeEdge = false;
 
 			if(edge->to == vertexId) {
@@ -255,8 +260,8 @@ public:
 	void dump()
 	{
 		logger->info("Vertices:");
-		for(auto& [vertexId, vertex] : vertices) {
-			(void) vertexId;
+		for(auto& v : vertices) {
+			auto& vertex = v.second;
 
 			// format connected vertices into a list
 			std::stringstream ssEdges;
@@ -268,8 +273,8 @@ public:
 		}
 
 		logger->info("Edges:");
-		for(auto& [edgeId, edge] : edges) {
-			(void) edgeId;
+		for(auto& e : edges) {
+			auto& edge = e.second;
 
 			logger->info("  {}: {} -> {}", *edge, edge->from, edge->to);
 		}

From 105f47d2d088949a2eb867803bf69635228fc040 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 12:13:51 +0200
Subject: [PATCH 12/31] common/memory: add check-callback to getPath() to
 select desired path

This is a workaround until we have a better heuristic (maybe shortest
path?) to choose between multiple paths in the graph. Since the (abstract)
graph has no idea about memory translations, getPath() may even yield
paths that are no valid translation because a pair of inbound/outbound
edges must not neccessarily share a common address window, but from the
perspective of the abstract graph present a valid path.
The callback function is used by the MemoryManager to verify if a path
candidate represents a valid translation.
---
 fpga/include/villas/directed_graph.hpp | 13 +++++++++--
 fpga/include/villas/memory_manager.hpp | 14 +++++++++++-
 fpga/include/villas/utils.hpp          |  9 ++++++++
 fpga/lib/common/memory_manager.cpp     | 30 +++++++++++++++++++++++---
 4 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/fpga/include/villas/directed_graph.hpp b/fpga/include/villas/directed_graph.hpp
index b75912335..48899a606 100644
--- a/fpga/include/villas/directed_graph.hpp
+++ b/fpga/include/villas/directed_graph.hpp
@@ -212,9 +212,17 @@ public:
 	vertexGetEdges(VertexIdentifier vertexId) const
 	{ return getVertex(vertexId)->edges; }
 
+
+	using check_path_fn = std::function<bool(const Path&)>;
+
+	static bool
+	checkPath(const Path&)
+	{ return true; }
+
 	bool getPath(VertexIdentifier fromVertexId,
 	             VertexIdentifier toVertexId,
-	             Path& path)
+	             Path& path,
+	             check_path_fn pathCheckFunc = checkPath)
 	{
 		if(fromVertexId == toVertexId) {
 			// arrived at the destination
@@ -244,7 +252,8 @@ public:
 				path.push_back(edgeId);
 
 				// recursive, depth-first search
-				if(getPath(edgeOfFromVertex->to, toVertexId, path)) {
+				if(getPath(edgeOfFromVertex->to, toVertexId, path, pathCheckFunc) and
+				   pathCheckFunc(path)) {
 					// path found, we're done
 				    return true;
 				} else {
diff --git a/fpga/include/villas/memory_manager.hpp b/fpga/include/villas/memory_manager.hpp
index 96d1ccda2..012910509 100644
--- a/fpga/include/villas/memory_manager.hpp
+++ b/fpga/include/villas/memory_manager.hpp
@@ -3,6 +3,7 @@
 #include <cstdint>
 #include <string>
 #include <map>
+#include <stdexcept>
 #include <unistd.h>
 
 #include "log.hpp"
@@ -73,7 +74,12 @@ private:
 	// This is a singleton, so private constructor ...
 	MemoryManager() :
 	    memoryGraph("MemoryGraph"),
-	    logger(loggerGetOrCreate("MemoryManager")) {}
+	    logger(loggerGetOrCreate("MemoryManager"))
+	{
+		pathCheckFunc = [&](const MemoryGraph::Path& path) {
+			return this->pathCheck(path);
+		};
+	}
 
 	// ... and no copying or assigning
 	MemoryManager(const MemoryManager&) = delete;
@@ -144,6 +150,8 @@ public:
 	using AddressSpaceId = MemoryGraph::VertexIdentifier;
 	using MappingId = MemoryGraph::EdgeIdentifier;
 
+	struct InvalidTranslation : public std::exception {};
+
 	/// Get singleton instance
 	static MemoryManager&
 	get();
@@ -210,6 +218,8 @@ private:
 	getTranslationFromMapping(const Mapping& mapping)
 	{ return MemoryTranslation(mapping.src, mapping.dest, mapping.size); }
 
+	bool
+	pathCheck(const MemoryGraph::Path& path);
 
 private:
 	/// Directed graph that stores address spaces and memory mappings
@@ -221,6 +231,8 @@ private:
 	/// Logger for universal access in this class
 	SpdLogger logger;
 
+	MemoryGraph::check_path_fn pathCheckFunc;
+
 	/// Static pointer to global instance, because this is a singleton
 	static MemoryManager* instance;
 };
diff --git a/fpga/include/villas/utils.hpp b/fpga/include/villas/utils.hpp
index b2b5b314c..d368de915 100644
--- a/fpga/include/villas/utils.hpp
+++ b/fpga/include/villas/utils.hpp
@@ -10,6 +10,15 @@ namespace utils {
 std::vector<std::string>
 tokenize(std::string s, std::string delimiter);
 
+
+template<typename T>
+void
+assertExcept(bool condition, const T& exception)
+{
+	if(not condition)
+		throw exception;
+}
+
 } // namespace utils
 } // namespace villas
 
diff --git a/fpga/lib/common/memory_manager.cpp b/fpga/lib/common/memory_manager.cpp
index f8e01e9b0..d832896c6 100644
--- a/fpga/lib/common/memory_manager.cpp
+++ b/fpga/lib/common/memory_manager.cpp
@@ -2,8 +2,11 @@
 #include <limits>
 #include <cstdint>
 
+#include <villas/utils.hpp>
 #include "memory_manager.hpp"
 
+using namespace villas::utils;
+
 namespace villas {
 
 MemoryManager*
@@ -76,7 +79,8 @@ MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
 {
 	// find a path through the memory graph
 	MemoryGraph::Path path;
-	if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, path)) {
+	if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, path, pathCheckFunc)) {
+
 		auto fromAddrSpace = memoryGraph.getVertex(fromAddrSpaceId);
 		auto toAddrSpace = memoryGraph.getVertex(toAddrSpaceId);
 
@@ -98,6 +102,26 @@ MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
 	return translation;
 }
 
+bool
+MemoryManager::pathCheck(const MemoryGraph::Path& path)
+{
+	// start with an identity mapping
+	MemoryTranslation translation(0, 0, SIZE_MAX);
+
+	// Try to add all mappings together to a common translation. If this fails
+	// there is a non-overlapping window
+	for(auto& mappingId : path) {
+		auto mapping = memoryGraph.getEdge(mappingId);
+		try {
+			translation += getTranslationFromMapping(*mapping);
+		} catch(const InvalidTranslation&) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
 uintptr_t
 MemoryTranslation::getLocalAddr(uintptr_t addrInForeignAddrSpace) const
 {
@@ -125,8 +149,8 @@ MemoryTranslation::operator+=(const MemoryTranslation& other)
 	const uintptr_t other_src_high = other.src + other.size;
 
 	// make sure there is a common memory area
-	assert(other.src < this_dst_high);
-	assert(this->dst < other_src_high);
+	assertExcept(other.src < this_dst_high, MemoryManager::InvalidTranslation());
+	assertExcept(this->dst < other_src_high, MemoryManager::InvalidTranslation());
 
 	const uintptr_t hi = std::max(this_dst_high, other_src_high);
 	const uintptr_t lo = std::min(this->dst, other.src);

From 218008955e6b99ba9c72278f186b2c15772a47cb Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 13:12:26 +0200
Subject: [PATCH 13/31] common/memory: fix memory translation merging

---
 fpga/lib/common/memory_manager.cpp | 60 ++++++++++++++++++------------
 1 file changed, 37 insertions(+), 23 deletions(-)

diff --git a/fpga/lib/common/memory_manager.cpp b/fpga/lib/common/memory_manager.cpp
index d832896c6..88da8536f 100644
--- a/fpga/lib/common/memory_manager.cpp
+++ b/fpga/lib/common/memory_manager.cpp
@@ -83,8 +83,7 @@ MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
 
 		auto fromAddrSpace = memoryGraph.getVertex(fromAddrSpaceId);
 		auto toAddrSpace = memoryGraph.getVertex(toAddrSpaceId);
-
-		logger->error("No translation found from ({}) to ({})",
+		logger->debug("No translation found from ({}) to ({})",
 		              *fromAddrSpace, *toAddrSpace);
 
 		throw std::out_of_range("no translation found");
@@ -148,6 +147,15 @@ MemoryTranslation::operator+=(const MemoryTranslation& other)
 	const uintptr_t this_dst_high = this->dst + this->size;
 	const uintptr_t other_src_high = other.src + other.size;
 
+	logger->debug("this->src:      {:#x}", this->src);
+	logger->debug("this->dst:      {:#x}", this->dst);
+	logger->debug("this->size:     {:#x}", this->size);
+	logger->debug("other.src:      {:#x}", other.src);
+	logger->debug("other.dst:      {:#x}", other.dst);
+	logger->debug("other.size:     {:#x}", other.size);
+	logger->debug("this_dst_high:  {:#x}", this_dst_high);
+	logger->debug("other_src_high: {:#x}", other_src_high);
+
 	// make sure there is a common memory area
 	assertExcept(other.src < this_dst_high, MemoryManager::InvalidTranslation());
 	assertExcept(this->dst < other_src_high, MemoryManager::InvalidTranslation());
@@ -159,33 +167,39 @@ MemoryTranslation::operator+=(const MemoryTranslation& other)
 	                          ? (this_dst_high - other_src_high)
 	                          : (other_src_high - this_dst_high);
 
-	const uintptr_t diff_lo = (this->dst > other.src)
+	const bool otherSrcIsSmaller = this->dst > other.src;
+	const uintptr_t diff_lo = (otherSrcIsSmaller)
 	                          ? (this->dst - other.src)
 	                          : (other.src - this->dst);
 
-	const size_t size = (hi - lo) - diff_hi - diff_lo;
+	logger->debug("hi:             {:#x}", hi);
+	logger->debug("lo:             {:#x}", lo);
+	logger->debug("diff_hi:        {:#x}", diff_hi);
+	logger->debug("diff_lo:        {:#x}", diff_lo);
 
-	logger->debug("this->src:      0x{:x}", this->src);
-	logger->debug("this->dst:      0x{:x}", this->dst);
-	logger->debug("this->size:     0x{:x}", this->size);
-	logger->debug("other.src:      0x{:x}", other.src);
-	logger->debug("other.dst:      0x{:x}", other.dst);
-	logger->debug("other.size:     0x{:x}", other.size);
-	logger->debug("this_dst_high:  0x{:x}", this_dst_high);
-	logger->debug("other_src_high: 0x{:x}", other_src_high);
-	logger->debug("hi:             0x{:x}", hi);
-	logger->debug("lo:             0x{:x}", lo);
-	logger->debug("diff_hi:        0x{:x}", diff_hi);
-	logger->debug("diff_hi:        0x{:x}", diff_lo);
-	logger->debug("size:           0x{:x}", size);
+	// new size of aperture, can only stay or shrink
+	this->size = (hi - lo) - diff_hi - diff_lo;
 
-	this->src += other.src;
-	this->dst += other.dst;
-	this->size = size;
+	// new translation will come out other's destination (by default)
+	this->dst = other.dst;
 
-	logger->debug("result src:     0x{:x}", this->src);
-	logger->debug("result dst:     0x{:x}", this->dst);
-	logger->debug("result size:    0x{:x}", this->size);
+	// the source stays the same and can only increase with merged translations
+	this->src = this->src;
+
+	if(otherSrcIsSmaller) {
+		// other mapping starts at lower addresses, so we actually arrive at
+		// higher addresses
+		this->dst += diff_lo;
+	} else {
+		// other mapping starts at higher addresses than this, so we have to
+		// increase the start
+		// NOTE: for addresses equality, this just adds 0
+		this->src += diff_lo;
+	}
+
+	logger->debug("result src:     {:#x}", this->src);
+	logger->debug("result dst:     {:#x}", this->dst);
+	logger->debug("result size:    {:#x}", this->size);
 
 	return *this;
 }

From cea353aa7f078f06480ca0a66a79f519792870b6 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 13:13:15 +0200
Subject: [PATCH 14/31] directed-graph: add getters for vertices of an edge

---
 fpga/include/villas/directed_graph.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fpga/include/villas/directed_graph.hpp b/fpga/include/villas/directed_graph.hpp
index 48899a606..0af3abf36 100644
--- a/fpga/include/villas/directed_graph.hpp
+++ b/fpga/include/villas/directed_graph.hpp
@@ -58,6 +58,12 @@ public:
 	operator==(const Edge& other)
 	{ return this->id == other.id; }
 
+	Vertex::Identifier getVertexTo() const
+	{ return to; }
+
+	Vertex::Identifier getVertexFrom() const
+	{ return from; }
+
 private:
 	Identifier id;
 	Vertex::Identifier from;

From 14704907473f0945ca81618b64db5bef0cceb364 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 13:15:29 +0200
Subject: [PATCH 15/31] common/memory: provide findPath() to get a path of
 address spaces

---
 fpga/include/villas/memory_manager.hpp |  3 +++
 fpga/lib/common/memory_manager.cpp     | 27 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/fpga/include/villas/memory_manager.hpp b/fpga/include/villas/memory_manager.hpp
index 012910509..877456abe 100644
--- a/fpga/include/villas/memory_manager.hpp
+++ b/fpga/include/villas/memory_manager.hpp
@@ -192,6 +192,9 @@ public:
 	AddressSpaceId
 	findAddressSpace(const std::string& name);
 
+	std::list<AddressSpaceId>
+	findPath(AddressSpaceId fromAddrSpaceId, AddressSpaceId toAddrSpaceId);
+
 	MemoryTranslation
 	getTranslation(AddressSpaceId fromAddrSpaceId, AddressSpaceId toAddrSpaceId);
 
diff --git a/fpga/lib/common/memory_manager.cpp b/fpga/lib/common/memory_manager.cpp
index 88da8536f..0754e0c13 100644
--- a/fpga/lib/common/memory_manager.cpp
+++ b/fpga/lib/common/memory_manager.cpp
@@ -73,6 +73,33 @@ MemoryManager::findAddressSpace(const std::string& name)
 	});
 }
 
+std::list<MemoryManager::AddressSpaceId>
+MemoryManager::findPath(MemoryManager::AddressSpaceId fromAddrSpaceId,
+                        MemoryManager::AddressSpaceId toAddrSpaceId)
+{
+	std::list<AddressSpaceId> path;
+
+	auto fromAddrSpace = memoryGraph.getVertex(fromAddrSpaceId);
+	auto toAddrSpace = memoryGraph.getVertex(toAddrSpaceId);
+
+	// find a path through the memory graph
+	MemoryGraph::Path pathGraph;
+	if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, pathGraph, pathCheckFunc)) {
+
+		logger->debug("No translation found from ({}) to ({})",
+		              *fromAddrSpace, *toAddrSpace);
+
+		throw std::out_of_range("no translation found");
+	}
+
+	for(auto& mappingId : pathGraph) {
+		auto mapping = memoryGraph.getEdge(mappingId);
+		path.push_back(mapping->getVertexTo());
+	}
+
+	return path;
+}
+
 MemoryTranslation
 MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
                               MemoryManager::AddressSpaceId toAddrSpaceId)

From 2477ed4b4b5db3c8d64c817bcc9351d0cf788667 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 13:21:52 +0200
Subject: [PATCH 16/31] common/memory: provide getPciAddressSpace() for a
 common PCIe address space

---
 fpga/include/villas/memory_manager.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fpga/include/villas/memory_manager.hpp b/fpga/include/villas/memory_manager.hpp
index 877456abe..387f5c0a2 100644
--- a/fpga/include/villas/memory_manager.hpp
+++ b/fpga/include/villas/memory_manager.hpp
@@ -115,7 +115,7 @@ private:
 			return stream << static_cast<const Edge&>(mapping) << " = "
 			              << mapping.name
 			              << std::hex
-			              << "(src=0x"   << mapping.src
+			              << " (src=0x"  << mapping.src
 			              << ", dest=0x" << mapping.dest
 			              << ", size=0x" << mapping.size
 			              << ")";
@@ -160,6 +160,10 @@ public:
 	getProcessAddressSpace()
 	{ return getOrCreateAddressSpace("villas-fpga"); }
 
+	AddressSpaceId
+	getPciAddressSpace()
+	{ return getOrCreateAddressSpace("PCIe"); }
+
 	AddressSpaceId
 	getProcessAddressSpaceMemoryBlock(const std::string& memoryBlock)
 	{ return getOrCreateAddressSpace(getSlaveAddrSpaceName("villas-fpga", memoryBlock)); }

From 6b7d6941035157d88d07c4ceba72c8fb443ea9f0 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 13:26:59 +0200
Subject: [PATCH 17/31] common/BaseAllocator: test allocated memory for
 accessibility

Write to and read-verify allocated memory block when using allocate()
wrapper.
---
 fpga/include/villas/memory.hpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/fpga/include/villas/memory.hpp b/fpga/include/villas/memory.hpp
index 12779bf01..38b801c45 100644
--- a/fpga/include/villas/memory.hpp
+++ b/fpga/include/villas/memory.hpp
@@ -130,6 +130,21 @@ public:
 	{
 		const size_t size = num * sizeof(T);
 		auto mem = allocateBlock(size);
+
+		// Check if the allocated memory is really accessible by writing to the
+		// allocated memory and reading back. Exponentially increase offset to
+		// speed up testing.
+		MemoryAccessor<volatile uint8_t> byteAccessor(*mem);
+		size_t idx = 0;
+		for(int i = 0; idx < mem->getSize(); i++, idx = (1 << i)) {
+			auto val = static_cast<uint8_t>(i);
+			byteAccessor[idx] = val;
+			if(byteAccessor[idx] != val) {
+				logger->error("Cannot access allocated memory");
+				throw std::bad_alloc();
+			}
+		}
+
 		return MemoryAccessor<T>(std::move(mem));
 	}
 
@@ -181,6 +196,9 @@ public:
 	size_t getAvailableMemory() const
 	{ return memorySize - nextFreeAddress; }
 
+	size_t getSize() const
+	{ return memorySize; }
+
 	std::string getName() const;
 
 	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>

From 1b2e7d312e2caff88f250dfe44e73a6db60a2f57 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 13:30:22 +0200
Subject: [PATCH 18/31] common/memory: add host DMA memory allocator using
 udmabuf

---
 fpga/.gitmodules               |   3 +
 fpga/include/villas/memory.hpp |  37 ++++++++++
 fpga/lib/common/memory.cpp     | 121 +++++++++++++++++++++++++++++++++
 fpga/thirdparty/udmabuf        |   1 +
 4 files changed, 162 insertions(+)
 create mode 160000 fpga/thirdparty/udmabuf

diff --git a/fpga/.gitmodules b/fpga/.gitmodules
index 8bbca3dab..c457c7c52 100644
--- a/fpga/.gitmodules
+++ b/fpga/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "lib/gpu/gdrcopy"]
 	path = lib/gpu/gdrcopy
 	url = https://github.com/daniel-k/gdrcopy.git
+[submodule "thirdparty/udmabuf"]
+	path = thirdparty/udmabuf
+	url = https://github.com/ikwzm/udmabuf
diff --git a/fpga/include/villas/memory.hpp b/fpga/include/villas/memory.hpp
index 38b801c45..b640743fb 100644
--- a/fpga/include/villas/memory.hpp
+++ b/fpga/include/villas/memory.hpp
@@ -245,4 +245,41 @@ private:
 	static HostRamAllocator allocator;
 };
 
+
+class HostDmaRam {
+private:
+
+	static std::string
+	getUdmaBufName(int num);
+
+	static std::string
+	getUdmaBufBasePath(int num);
+
+	static size_t
+	getUdmaBufBufSize(int num);
+
+	static uintptr_t
+	getUdmaBufPhysAddr(int num);
+
+public:
+	class HostDmaRamAllocator : public LinearAllocator {
+	public:
+		HostDmaRamAllocator(int num);
+
+		virtual ~HostDmaRamAllocator();
+
+		std::string getName() const
+		{ return getUdmaBufName(num); }
+
+	private:
+		int num;
+	};
+
+	static HostDmaRamAllocator&
+	getAllocator(int num = 0);
+
+private:
+	static std::map<int, std::unique_ptr<HostDmaRamAllocator>> allocators;
+};
+
 } // namespace villas
diff --git a/fpga/lib/common/memory.cpp b/fpga/lib/common/memory.cpp
index c6d5b386e..3a44ab63e 100644
--- a/fpga/lib/common/memory.cpp
+++ b/fpga/lib/common/memory.cpp
@@ -1,6 +1,9 @@
 #include <sys/mman.h>
 #include <unistd.h>
 
+#include <sstream>
+#include <fstream>
+
 #include "memory.hpp"
 
 namespace villas {
@@ -140,4 +143,122 @@ HostRam::HostRamAllocator::HostRamAllocator() :
 	};
 }
 
+
+std::map<int, std::unique_ptr<HostDmaRam::HostDmaRamAllocator>>
+HostDmaRam::allocators;
+
+HostDmaRam::HostDmaRamAllocator::HostDmaRamAllocator(int num) :
+    LinearAllocator(MemoryManager::get().getOrCreateAddressSpace(getUdmaBufName(num)), getUdmaBufBufSize(num)),
+    num(num)
+{
+	auto& mm = MemoryManager::get();
+	logger = loggerGetOrCreate(getName());
+
+	if(getSize() == 0) {
+		logger->error("Zero-sized DMA buffer not supported, is the kernel module loaded?");
+		throw std::bad_alloc();
+	}
+
+	const uintptr_t base = getUdmaBufPhysAddr(num);
+
+	mm.createMapping(base, 0, getSize(), getName() + "-PCI",
+	                 mm.getPciAddressSpace(), getAddrSpaceId());
+
+	const auto bufPath = std::string("/dev/") + getUdmaBufName(num);
+	const int bufFd = open(bufPath.c_str(), O_RDWR | O_SYNC);
+	if(bufFd != -1) {
+		void* buf = mmap(nullptr, getSize(), PROT_READ|PROT_WRITE, MAP_SHARED, bufFd, 0);
+		close(bufFd);
+
+		if(buf != MAP_FAILED) {
+			mm.createMapping(reinterpret_cast<uintptr_t>(buf), 0, getSize(),
+			                 getName() + "-VA",
+			                 mm.getProcessAddressSpace(), getAddrSpaceId());
+		} else {
+			logger->warn("Cannot map {}", bufPath);
+		}
+	} else {
+		logger->warn("Cannot open {}", bufPath);
+	}
+
+	logger->info("Mapped {} of size {} bytes", bufPath, getSize());
+}
+
+HostDmaRam::HostDmaRamAllocator::~HostDmaRamAllocator()
+{
+	auto& mm = MemoryManager::get();
+
+	void* baseVirt;
+	try {
+		auto translation = mm.getTranslationFromProcess(getAddrSpaceId());
+		baseVirt = reinterpret_cast<void*>(translation.getLocalAddr(0));
+	} catch(const std::out_of_range&) {
+		// not mapped, nothing to do
+		return;
+	}
+
+	logger->debug("Unmapping {}", getName());
+
+	// try to unmap it
+	if(::munmap(baseVirt, getSize()) != 0) {
+		logger->warn("munmap() failed for {:p} of size {:#x}",
+		             baseVirt, getSize());
+	}
+}
+
+std::string
+HostDmaRam::getUdmaBufName(int num)
+{
+	std::stringstream name;
+	name << "udmabuf" << num;
+
+	return name.str();
+}
+
+std::string
+HostDmaRam::getUdmaBufBasePath(int num)
+{
+	std::stringstream path;
+	path << "/sys/class/udmabuf/udmabuf" << num << "/";
+	return path.str();
+}
+
+size_t
+HostDmaRam::getUdmaBufBufSize(int num)
+{
+	std::fstream s(getUdmaBufBasePath(num) + "size", s.in);
+	if(s.is_open()) {
+		std::string line;
+		if(std::getline(s, line)) {
+			return std::strtoul(line.c_str(), nullptr, 10);
+		}
+	}
+
+	return 0;
+}
+
+uintptr_t
+HostDmaRam::getUdmaBufPhysAddr(int num)
+{
+	std::fstream s(getUdmaBufBasePath(num) + "phys_addr", s.in);
+	if(s.is_open()) {
+		std::string line;
+		if(std::getline(s, line)) {
+			return std::strtoul(line.c_str(), nullptr, 16);
+		}
+	}
+
+	return UINTPTR_MAX;
+}
+
+HostDmaRam::HostDmaRamAllocator&HostDmaRam::getAllocator(int num)
+{
+	auto& allocator = allocators[num];
+	if(not allocator) {
+		allocator = std::make_unique<HostDmaRamAllocator>(num);
+	}
+
+	return *allocator;
+}
+
 } // namespace villas
diff --git a/fpga/thirdparty/udmabuf b/fpga/thirdparty/udmabuf
new file mode 160000
index 000000000..65762ca33
--- /dev/null
+++ b/fpga/thirdparty/udmabuf
@@ -0,0 +1 @@
+Subproject commit 65762ca3333cb230e3b80ecf4cb5e605390474f2

From e819829560c10ab442bbd6c6e388c449cfdf5bd2 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 14:48:49 +0200
Subject: [PATCH 19/31] directed-graph: add dumping to dot-file (graphviz)

---
 fpga/include/villas/directed_graph.hpp | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/fpga/include/villas/directed_graph.hpp b/fpga/include/villas/directed_graph.hpp
index 0af3abf36..c2b92674a 100644
--- a/fpga/include/villas/directed_graph.hpp
+++ b/fpga/include/villas/directed_graph.hpp
@@ -5,6 +5,7 @@
 #include <memory>
 #include <sstream>
 #include <string>
+#include <fstream>
 #include <stdexcept>
 #include <algorithm>
 
@@ -272,7 +273,7 @@ public:
 		return false;
 	}
 
-	void dump()
+	void dump(const std::string& fileName = "")
 	{
 		logger->info("Vertices:");
 		for(auto& v : vertices) {
@@ -287,11 +288,29 @@ public:
 			logger->info("  {} connected to: {}", *vertex, ssEdges.str());
 		}
 
+		std::fstream s(fileName, s.out | s.trunc);
+		if(s.is_open()) {
+			s << "digraph memgraph {" << std::endl;
+		}
+
 		logger->info("Edges:");
 		for(auto& e : edges) {
 			auto& edge = e.second;
 
 			logger->info("  {}: {} -> {}", *edge, edge->from, edge->to);
+			if(s.is_open()) {
+				auto from = getVertex(edge->from);
+				auto to = getVertex(edge->to);
+
+				s << std::dec;
+				s << "  \"" << *from << "\" -> \"" << *to << "\""
+				  << " [label=\"" << *edge << "\"];" << std::endl;
+			}
+		}
+
+		if(s.is_open()) {
+			s << "}" << std::endl;
+			s.close();
 		}
 	}
 

From 2bfb9e24500bcb8473f1133efa7dd913fadef137 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 15:12:17 +0200
Subject: [PATCH 20/31] common/memory: expose method to dump memory graph to
 file

---
 fpga/include/villas/memory_manager.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fpga/include/villas/memory_manager.hpp b/fpga/include/villas/memory_manager.hpp
index 387f5c0a2..555baa694 100644
--- a/fpga/include/villas/memory_manager.hpp
+++ b/fpga/include/villas/memory_manager.hpp
@@ -218,6 +218,9 @@ public:
 	dump()
 	{ memoryGraph.dump(); }
 
+	void
+	dumpToFile(const std::string& fileName)
+	{ memoryGraph.dump(fileName); }
 
 private:
 	/// Convert a Mapping to MemoryTranslation for calculations

From ad820a3618c819463b89f4fe40a710edc4081694 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 15:11:47 +0200
Subject: [PATCH 21/31] kernel/pci: parse BAR regions

---
 fpga/include/villas/kernel/pci.h | 10 +++++
 fpga/lib/kernel/pci.c            | 67 ++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/fpga/include/villas/kernel/pci.h b/fpga/include/villas/kernel/pci.h
index 10fd86ef6..3cd9565ed 100644
--- a/fpga/include/villas/kernel/pci.h
+++ b/fpga/include/villas/kernel/pci.h
@@ -9,6 +9,7 @@
 
 #pragma once
 
+#include <stdint.h>
 #include "list.h"
 
 #define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)
@@ -33,6 +34,13 @@ struct pci_device {
 	} slot;			/**< Bus, Device, Function (BDF) */
 };
 
+struct pci_region {
+	int num;
+	uintptr_t start;
+	uintptr_t end;
+	unsigned long long flags;
+};
+
 struct pci {
 	struct list devices; /**< List of available PCI devices in the system (struct pci_device) */
 };
@@ -66,6 +74,8 @@ int pci_attach_driver(const struct pci_device *d, const char *driver);
 /** Return the IOMMU group of this PCI device or -1 if the device is not in a group. */
 int pci_get_iommu_group(const struct pci_device *d);
 
+size_t pci_get_regions(const struct pci_device *d, struct pci_region** regions);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/fpga/lib/kernel/pci.c b/fpga/lib/kernel/pci.c
index 1f7336742..f920ea6d5 100644
--- a/fpga/lib/kernel/pci.c
+++ b/fpga/lib/kernel/pci.c
@@ -254,6 +254,73 @@ struct pci_device * pci_lookup_device(struct pci *p, struct pci_device *f)
 	return list_search(&p->devices, (cmp_cb_t) pci_device_compare, (void *) f);
 }
 
+size_t pci_get_regions(const struct pci_device *d, struct pci_region** regions)
+{
+	FILE* f;
+	char sysfs[1024];
+
+	assert(regions != NULL);
+
+	snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/resource",
+	         SYSFS_PATH, d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
+
+	f = fopen(sysfs, "r");
+	if (!f)
+		serror("Failed to open resource mapping %s", sysfs);
+
+	struct pci_region _regions[8];
+	struct pci_region* cur_region = _regions;
+	size_t valid_regions = 0;
+
+	ssize_t bytesRead;
+	char* line = NULL;
+	size_t len = 0;
+
+	int region = 0;
+	// cap to 8 regions, just because we don't know how many may exist
+	while(region < 8 && (bytesRead = getline(&line, &len, f)) != -1) {
+		unsigned long long tokens[3];
+		char* s = line;
+		for(int i = 0; i < 3; i++) {
+			char* end;
+			tokens[i] = strtoull(s, &end, 16);
+			if(s == end) {
+				printf("Error parsing line %d of %s\n", region + 1, sysfs);
+				tokens[0] = tokens[1] = 0; // mark invalid
+				break;
+			}
+			s = end;
+		}
+
+		free(line);
+
+		// required for getline() to allocate a new buffer on the next iteration
+		line = NULL;
+		len = 0;
+
+		if(tokens[0] != tokens[1]) {
+			// this is a valid region
+			cur_region->num = region;
+			cur_region->start = tokens[0];
+			cur_region->end = tokens[1];
+			cur_region->flags = tokens[2];
+			cur_region++;
+			valid_regions++;
+		}
+
+		region++;
+	}
+
+	if(valid_regions > 0) {
+		const size_t len = valid_regions * sizeof (struct pci_region);
+		*regions = malloc(len);
+		memcpy(*regions, _regions, len);
+	}
+
+	return valid_regions;
+}
+
+
 int pci_get_driver(const struct pci_device *d, char *buf, size_t buflen)
 {
 	int ret;

From c818c242f3181cac4e55a0026cff6de2c752807d Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 15:14:40 +0200
Subject: [PATCH 22/31] kernel/pci: fix unitialized memory

---
 fpga/lib/kernel/pci.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fpga/lib/kernel/pci.c b/fpga/lib/kernel/pci.c
index f920ea6d5..0dbdf08a4 100644
--- a/fpga/lib/kernel/pci.c
+++ b/fpga/lib/kernel/pci.c
@@ -325,6 +325,7 @@ int pci_get_driver(const struct pci_device *d, char *buf, size_t buflen)
 {
 	int ret;
 	char sysfs[1024], syml[1024];
+	memset(syml, 0, sizeof(syml));
 
 	snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/driver", SYSFS_PATH,
 		d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
@@ -372,6 +373,7 @@ int pci_get_iommu_group(const struct pci_device *d)
 {
 	int ret;
 	char *group, link[1024], sysfs[1024];
+	memset(link, 0, sizeof(link));
 
 	snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/iommu_group", SYSFS_PATH,
 		d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);

From 8f3833bc73a21b25ecc5e388adb678a6e845a094 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 17:18:05 +0200
Subject: [PATCH 23/31] ips/dma: rename pingpong to memcpy and always connect
 loopback

---
 fpga/include/villas/fpga/ips/dma.hpp | 2 +-
 fpga/lib/ips/dma.cpp                 | 8 +++++++-
 fpga/tests/dma.cpp                   | 2 +-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp
index 41a1fe115..d8b609794 100644
--- a/fpga/include/villas/fpga/ips/dma.hpp
+++ b/fpga/include/villas/fpga/ips/dma.hpp
@@ -55,7 +55,7 @@ public:
 	bool readComplete()
 	{ return hasScatterGather() ? readCompleteSG() : readCompleteSimple(); }
 
-	bool pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len);
+	bool memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len);
 
 	inline bool
 	hasScatterGather() const
diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp
index 3a5878441..ed9f0a888 100644
--- a/fpga/lib/ips/dma.cpp
+++ b/fpga/lib/ips/dma.cpp
@@ -132,8 +132,14 @@ Dma::reset()
 
 
 bool
-Dma::pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
+Dma::memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
 {
+	if(len == 0)
+		return true;
+
+	if(not connectLoopback())
+		return false;
+
 	if(this->read(dst, len) == 0)
 		return false;
 
diff --git a/fpga/tests/dma.cpp b/fpga/tests/dma.cpp
index 63c1c0470..cb6b722f6 100644
--- a/fpga/tests/dma.cpp
+++ b/fpga/tests/dma.cpp
@@ -55,7 +55,7 @@ Test(fpga, dma, .description = "DMA")
 		cr_assert(len == lenRandom, "Failed to get random data");
 
 		/* Start transfer */
-		cr_assert(dma.pingPong(src.getMemoryBlock(), dst.getMemoryBlock(), len),
+		cr_assert(dma.memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len),
 		          "DMA ping pong failed");
 
 		/* Compare data */

From 364b13715664ee801c04701b82473a1718d66a3f Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 17:47:43 +0200
Subject: [PATCH 24/31] fpga/card: make pci device a class member (needed
 later)

---
 fpga/include/villas/fpga/card.hpp | 3 ++-
 fpga/lib/card.cpp                 | 4 +---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/fpga/include/villas/fpga/card.hpp b/fpga/include/villas/fpga/card.hpp
index f9d78151a..d4b4b5cc2 100644
--- a/fpga/include/villas/fpga/card.hpp
+++ b/fpga/include/villas/fpga/card.hpp
@@ -98,8 +98,9 @@ public:	// TODO: make this private
 
 	std::string name;			/**< The name of the FPGA card */
 
-	struct pci *pci;
+	struct pci* pci;
 	struct pci_device filter;		/**< Filter for PCI device. */
+	struct pci_device* pdev;		/**< PCI device handle */
 
 	/// The VFIO container that this card is part of
 	std::shared_ptr<VfioContainer> vfioContainer;
diff --git a/fpga/lib/card.cpp b/fpga/lib/card.cpp
index fb7369823..4581d7482 100644
--- a/fpga/lib/card.cpp
+++ b/fpga/lib/card.cpp
@@ -203,10 +203,8 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
 
 
 bool
-fpga::PCIeCard::init()
+PCIeCard::init()
 {
-	struct pci_device *pdev;
-
 	auto& mm = MemoryManager::get();
 	logger = getLogger();
 

From 89b5169a6ec4846f20a8794ac05db86ac4537c63 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 17:41:40 +0200
Subject: [PATCH 25/31] ips/pcie: parse AXI/PCI BARs and create mappings
 to/from PCIe address space

This is used for translations that don't use VFIO which used to bridge
the PCIe address space by creating direct mappings from process VA to
the FPGA. When we want to communicate directly via PCIe without the
involvment of the CPU/VFIO, we need the proper translations that are
configured in the FPGA hardware.
---
 fpga/include/villas/fpga/ips/pcie.hpp | 15 +++++
 fpga/lib/ips/pcie.cpp                 | 96 +++++++++++++++++++++++++++
 2 files changed, 111 insertions(+)

diff --git a/fpga/include/villas/fpga/ips/pcie.hpp b/fpga/include/villas/fpga/ips/pcie.hpp
index 28b01c5aa..ebce36178 100644
--- a/fpga/include/villas/fpga/ips/pcie.hpp
+++ b/fpga/include/villas/fpga/ips/pcie.hpp
@@ -52,6 +52,19 @@ public:
 private:
 	static constexpr char axiInterface[] = "M_AXI";
 	static constexpr char pcieMemory[] = "BAR0";
+
+	struct AxiBar {
+		uintptr_t base;
+		size_t size;
+		uintptr_t translation;
+	};
+
+	struct PciBar {
+		uintptr_t translation;
+	};
+
+	std::map<std::string, AxiBar> axiToPcieTranslations;
+	std::map<std::string, PciBar> pcieToAxiTranslations;
 };
 
 
@@ -64,6 +77,8 @@ public:
 	getCompatibleVlnvString()
 	{ return "xilinx.com:ip:axi_pcie:"; }
 
+	bool configureJson(IpCore& ip, json_t *json_ip);
+
 	IpCore* create()
 	{ return new AxiPciExpressBridge; }
 
diff --git a/fpga/lib/ips/pcie.cpp b/fpga/lib/ips/pcie.cpp
index 59174318b..3b273bbd8 100644
--- a/fpga/lib/ips/pcie.cpp
+++ b/fpga/lib/ips/pcie.cpp
@@ -63,6 +63,102 @@ AxiPciExpressBridge::init()
 	card->addrSpaceIdDeviceToHost =
 	        mm.getOrCreateAddressSpace(addrSpaceNameDeviceToHost);
 
+	auto pciAddrSpaceId = mm.getPciAddressSpace();
+
+	struct pci_region* pci_regions = nullptr;
+	size_t num_regions = pci_get_regions(card->pdev, &pci_regions);
+
+	for(size_t i = 0; i < num_regions; i++) {
+		const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
+
+		char barName[] = "BARx";
+		barName[3] = '0' + pci_regions[i].num;
+		auto pciBar = pcieToAxiTranslations.at(barName);
+
+
+		logger->info("PCI-BAR{}: bus addr={:#x} size={:#x}",
+		             pci_regions[i].num, pci_regions[i].start, region_size);
+		logger->info("PCI-BAR{}: AXI translation offset {:#x}",
+		             i, pciBar.translation);
+
+		mm.createMapping(pci_regions[i].start, pciBar.translation, region_size,
+		                 std::string("PCI-") + barName,
+		                 pciAddrSpaceId, card->addrSpaceIdHostToDevice);
+
+	}
+
+	if(pci_regions != nullptr) {
+		logger->debug("freeing pci regions");
+		free(pci_regions);
+	}
+
+
+	for(auto& [barName, axiBar] : axiToPcieTranslations) {
+		logger->info("AXI-{}: bus addr={:#x} size={:#x}",
+		             barName, axiBar.base, axiBar.size);
+		logger->info("AXI-{}: PCI translation offset: {:#x}",
+		             barName, axiBar.translation);
+
+		auto barXAddrSpaceName = mm.getSlaveAddrSpaceName(getInstanceName(), barName);
+		auto barXAddrSpaceId = mm.getOrCreateAddressSpace(barXAddrSpaceName);
+
+		// base is already incorporated into mapping of each IP by Vivado, so
+		// the mapping src has to be 0
+		mm.createMapping(0, axiBar.translation, axiBar.size,
+		                 std::string("AXI-") + barName,
+		                 barXAddrSpaceId, pciAddrSpaceId);
+	}
+
+	return true;
+}
+
+bool
+AxiPciExpressBridgeFactory::configureJson(IpCore& ip, json_t* json_ip)
+{
+	auto logger = getLogger();
+	auto& pcie = reinterpret_cast<AxiPciExpressBridge&>(ip);
+
+	for(auto barType : std::list<std::string>{"axi_bars", "pcie_bars"}) {
+		json_t* json_bars = json_object_get(json_ip, barType.c_str());
+		if(not json_is_object(json_bars)) {
+			return false;
+		}
+
+		json_t* json_bar;
+		const char* bar_name;
+		json_object_foreach(json_bars, bar_name, json_bar) {
+			unsigned int translation;
+			int ret = json_unpack(json_bar, "{ s: i }", "translation", &translation);
+			if(ret != 0) {
+				logger->error("Cannot parse {}/{}", barType, bar_name);
+				return false;
+			}
+
+			if(barType == "axi_bars") {
+				json_int_t base, high, size;
+				int ret = json_unpack(json_bar, "{ s: I, s: I, s: I }",
+				                      "baseaddr", &base,
+				                      "highaddr", &high,
+				                      "size", &size);
+				if(ret != 0) {
+					logger->error("Cannot parse {}/{}", barType, bar_name);
+					return false;
+				}
+
+				pcie.axiToPcieTranslations[bar_name] = {
+				    .base = static_cast<uintptr_t>(base),
+				    .size = static_cast<size_t>(size),
+				    .translation = translation
+				};
+
+			} else {
+				pcie.pcieToAxiTranslations[bar_name] = {
+				    .translation = translation
+				};
+			}
+		}
+	}
+
 	return true;
 }
 

From f644a9faa8437bd80971e5f2af81c8696d01b97a Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 17:50:48 +0200
Subject: [PATCH 26/31] ips/pcie: move BAR0 mapping from card into PCIe IP

---
 fpga/lib/card.cpp     | 30 ------------------------------
 fpga/lib/ips/pcie.cpp | 23 ++++++++++++++++-------
 2 files changed, 16 insertions(+), 37 deletions(-)

diff --git a/fpga/lib/card.cpp b/fpga/lib/card.cpp
index 4581d7482..de32310f5 100644
--- a/fpga/lib/card.cpp
+++ b/fpga/lib/card.cpp
@@ -205,7 +205,6 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
 bool
 PCIeCard::init()
 {
-	auto& mm = MemoryManager::get();
 	logger = getLogger();
 
 	logger->info("Initializing FPGA card {}", name);
@@ -221,41 +220,12 @@ PCIeCard::init()
 	VfioDevice& device = vfioContainer->attachDevice(pdev);
 	this->vfioDevice = &device;
 
-
 	/* Enable memory access and PCI bus mastering for DMA */
 	if (not device.pciEnable()) {
 		logger->error("Failed to enable PCI device");
 		return false;
 	}
 
-	/* Map PCIe BAR */
-	const void* bar0_mapped = vfioDevice->regionMap(VFIO_PCI_BAR0_REGION_INDEX);
-	if (bar0_mapped == MAP_FAILED) {
-		logger->error("Failed to mmap() BAR0");
-		return false;
-	}
-
-	// determine size of BAR0 region
-	const size_t bar0_size = vfioDevice->regionGetSize(VFIO_PCI_BAR0_REGION_INDEX);
-
-
-	/* Link mapped BAR0 to global memory graph */
-
-	// get the address space of the current application
-	const auto villasAddrSpace = mm.getProcessAddressSpace();
-
-	// get the address space for the PCIe proxy we use with VFIO
-	const auto cardPCIeAddrSpaceName = mm.getMasterAddrSpaceName(name, "PCIe");
-
-	// create a new address space for this FPGA card
-	addrSpaceIdHostToDevice = mm.getOrCreateAddressSpace(cardPCIeAddrSpaceName);
-
-	// create a mapping from our address space to the FPGA card via vfio
-	mm.createMapping(reinterpret_cast<uintptr_t>(bar0_mapped),
-	                                   0, bar0_size, "VFIO_map",
-	                                   villasAddrSpace, addrSpaceIdHostToDevice);
-
-
 	/* Reset system? */
 	if (do_reset) {
 		/* Reset / detect PCI device */
diff --git a/fpga/lib/ips/pcie.cpp b/fpga/lib/ips/pcie.cpp
index 3b273bbd8..c058056ff 100644
--- a/fpga/lib/ips/pcie.cpp
+++ b/fpga/lib/ips/pcie.cpp
@@ -42,14 +42,23 @@ AxiPciExpressBridge::init()
 
 	// Throw an exception if the is no bus master interface and thus no
 	// address space we can use for translation -> error
-	const MemoryManager::AddressSpaceId myAddrSpaceid =
-	        busMasterInterfaces.at(axiInterface);
+	card->addrSpaceIdHostToDevice = busMasterInterfaces.at(axiInterface);
 
-	// Create an identity mapping from the FPGA card to this IP as an entry
-	// point to all other IPs in the FPGA, because Vivado will generate a
-	// memory view for this bridge that can see all others.
-	MemoryManager::get().createMapping(0x00, 0x00, SIZE_MAX, "PCIeBridge",
-	                                   card->addrSpaceIdHostToDevice, myAddrSpaceid);
+	/* Map PCIe BAR0 via VFIO */
+	const void* bar0_mapped = card->vfioDevice->regionMap(VFIO_PCI_BAR0_REGION_INDEX);
+	if (bar0_mapped == MAP_FAILED) {
+		logger->error("Failed to mmap() BAR0");
+		return false;
+	}
+
+	// determine size of BAR0 region
+	const size_t bar0_size = card->vfioDevice->regionGetSize(VFIO_PCI_BAR0_REGION_INDEX);
+
+	// create a mapping from process address space to the FPGA card via vfio
+	mm.createMapping(reinterpret_cast<uintptr_t>(bar0_mapped),
+	                 0, bar0_size, "VFIO-H2D",
+	                 mm.getProcessAddressSpace(),
+	                 card->addrSpaceIdHostToDevice);
 
 
 	/* Make PCIe (IOVA) address space available to FPGA via BAR0 */

From f823dde0f49d476b8ffd185ae98b86a6b55758fe Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 17:52:03 +0200
Subject: [PATCH 27/31] card: don't try to create a VFIO mapping if IOMMU is
 disabled

In this case, VFIO cannot create DMA mappings.
---
 fpga/lib/card.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fpga/lib/card.cpp b/fpga/lib/card.cpp
index de32310f5..9df391438 100644
--- a/fpga/lib/card.cpp
+++ b/fpga/lib/card.cpp
@@ -165,6 +165,11 @@ PCIeCard::lookupIp(const Vlnv& vlnv) const
 bool
 PCIeCard::mapMemoryBlock(const MemoryBlock& block)
 {
+	if(not vfioContainer->isIommuEnabled()) {
+		logger->warn("VFIO mapping not supported without IOMMU");
+		return false;
+	}
+
 	auto& mm = MemoryManager::get();
 	const auto& addrSpaceId = block.getAddrSpaceId();
 
@@ -175,7 +180,6 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
 		logger->debug("Create VFIO mapping for {}", addrSpaceId);
 	}
 
-
 	auto translationFromProcess = mm.getTranslationFromProcess(addrSpaceId);
 	uintptr_t processBaseAddr = translationFromProcess.getLocalAddr(0);
 	uintptr_t iovaAddr = vfioContainer->memoryMap(processBaseAddr,
@@ -188,10 +192,8 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
 		return false;
 	}
 
-
-
 	mm.createMapping(iovaAddr, 0, block.getSize(),
-	                 "vfio",
+	                 "VFIO-D2H",
 	                 this->addrSpaceIdDeviceToHost,
 	                 addrSpaceId);
 

From 7dcdfaccd9793cad630390fbdd6d442869f0702a Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 17:53:06 +0200
Subject: [PATCH 28/31] ips/dma: let user deal with making memory accessible to
 DMA

It is probably too costly to do (and verify) it on every read
or write. Furthermore, the user knows better how to make a certain
memory available to the DMA.
---
 fpga/include/villas/fpga/ip.hpp      |  4 ++
 fpga/include/villas/fpga/ips/dma.hpp |  4 ++
 fpga/lib/ips/dma.cpp                 | 60 ++++++++++++++++++++++------
 3 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/fpga/include/villas/fpga/ip.hpp b/fpga/include/villas/fpga/ip.hpp
index f565aa4a2..1950a8ed2 100644
--- a/fpga/include/villas/fpga/ip.hpp
+++ b/fpga/include/villas/fpga/ip.hpp
@@ -193,6 +193,10 @@ protected:
 	InterruptController*
 	getInterruptController(const std::string& interruptName) const;
 
+	MemoryManager::AddressSpaceId
+	getMasterAddrSpaceByInterface(const std::string& masterInterfaceName) const
+	{ return busMasterInterfaces.at(masterInterfaceName); }
+
 protected:
 	struct IrqPort {
 		int num;
diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp
index d8b609794..88530dbe3 100644
--- a/fpga/include/villas/fpga/ips/dma.hpp
+++ b/fpga/include/villas/fpga/ips/dma.hpp
@@ -57,6 +57,8 @@ public:
 
 	bool memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len);
 
+	bool makeAccesibleFromVA(const MemoryBlock& mem);
+
 	inline bool
 	hasScatterGather() const
 	{ return hasSG; }
@@ -72,6 +74,8 @@ private:
 	bool writeCompleteSimple();
 	bool readCompleteSimple();
 
+	bool isMemoryBlockAccesible(const MemoryBlock& mem, const std::string& interface);
+
 private:
 	static constexpr char registerMemory[] = "Reg";
 
diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp
index ed9f0a888..3d3bf0e70 100644
--- a/fpga/lib/ips/dma.cpp
+++ b/fpga/lib/ips/dma.cpp
@@ -159,17 +159,14 @@ Dma::memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
 size_t
 Dma::write(const MemoryBlock& mem, size_t len)
 {
-	// make sure memory is reachable
-	if(not card->mapMemoryBlock(mem)) {
-		logger->error("Memory not accessible by DMA");
-		return 0;
-	}
-
 	auto& mm = MemoryManager::get();
+
+	// user has to make sure that memory is accessible, otherwise this will throw
 	auto translation = mm.getTranslation(busMasterInterfaces[mm2sInterface],
 	                                     mem.getAddrSpaceId());
 	const void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));
 
+	logger->debug("Write to address: {:p}", buf);
 	return hasScatterGather() ? writeSG(buf, len) : writeSimple(buf, len);
 }
 
@@ -177,17 +174,14 @@ Dma::write(const MemoryBlock& mem, size_t len)
 size_t
 Dma::read(const MemoryBlock& mem, size_t len)
 {
-	// make sure memory is reachable
-	if(not card->mapMemoryBlock(mem)) {
-		logger->error("Memory not accessible by DMA");
-		return 0;
-	}
-
 	auto& mm = MemoryManager::get();
+
+	// user has to make sure that memory is accessible, otherwise this will throw
 	auto translation = mm.getTranslation(busMasterInterfaces[s2mmInterface],
 	                                     mem.getAddrSpaceId());
 	void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));
 
+	logger->debug("Read from address: {:p}", buf);
 	return hasScatterGather() ? readSG(buf, len) : readSimple(buf, len);
 }
 
@@ -356,6 +350,48 @@ Dma::readCompleteSimple()
 }
 
 
+bool
+Dma::makeAccesibleFromVA(const MemoryBlock& mem)
+{
+	// only symmetric mapping supported currently
+	if(isMemoryBlockAccesible(mem, s2mmInterface) and
+	   isMemoryBlockAccesible(mem, mm2sInterface)) {
+		return true;
+	}
+
+	// try mapping via FPGA-card (VFIO)
+	if(not card->mapMemoryBlock(mem)) {
+		logger->error("Memory not accessible by DMA");
+		return false;
+	}
+
+	// sanity-check if mapping worked, this shouldn't be neccessary
+	if(not isMemoryBlockAccesible(mem, s2mmInterface) or
+	   not isMemoryBlockAccesible(mem, mm2sInterface)) {
+		logger->error("Mapping memory via card didn't work, but reported success?!");
+		return false;
+	}
+
+	return true;
+}
+
+
+bool
+Dma::isMemoryBlockAccesible(const MemoryBlock& mem, const std::string& interface)
+{
+	auto& mm = MemoryManager::get();
+
+	try {
+		mm.findPath(getMasterAddrSpaceByInterface(interface), mem.getAddrSpaceId());
+	} catch(const std::out_of_range&) {
+		// not (yet) accessible
+		return false;
+	}
+
+	return true;
+}
+
+
 } // namespace ip
 } // namespace fpga
 } // namespace villas

From 24db7ea1c0b39fc1ecfc0ff7aa1472450c6b3182 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 17:54:37 +0200
Subject: [PATCH 29/31] tests/dma: update to current progress

---
 fpga/tests/dma.cpp | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/fpga/tests/dma.cpp b/fpga/tests/dma.cpp
index cb6b722f6..e7c462b26 100644
--- a/fpga/tests/dma.cpp
+++ b/fpga/tests/dma.cpp
@@ -26,38 +26,52 @@ Test(fpga, dma, .description = "DMA")
 
 		auto dma = reinterpret_cast<villas::fpga::ip::Dma&>(*ip);
 
-		if(not dma.connectLoopback()) {
+		if(not dma.loopbackPossible()) {
+			logger->info("Loopback test not possible for {}", *ip);
 			continue;
 		}
 
 		count++;
 
-		if(not dma.loopbackPossible()) {
-			logger->info("Loopback test not possible for {}", *ip);
-			continue;
-		}
+
+		// Simple DMA can only transfer up to 4 kb due to PCIe page size burst
+		// limitation
+		size_t len = 4 * (1 << 10);
 
 		// find a block RAM IP to write to
 		auto bramIp = state.cards.front()->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"));
 		auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(bramIp);
 		cr_assert_not_null(bram, "Couldn't find BRAM");
 
-		// Simple DMA can only transfer up to 4 kb due to PCIe page size burst
-		// limitation
-		size_t len = 4 * (1 << 10);
 
 		/* Allocate memory to use with DMA */
-		auto src = villas::HostRam::getAllocator().allocate<char>(len);
-		auto dst = bram->getAllocator().allocate<char>(len);
+		auto src = villas::HostDmaRam::getAllocator().allocate<char>(len);
+		auto dst = villas::HostDmaRam::getAllocator().allocate<char>(len);
+
+		/* ... only works with IOMMU enabled currently */
+//		auto src = bram->getAllocator().allocate<char>(len);
+//		auto dst = bram->getAllocator().allocate<char>(len);
+
+		/* ... only works with IOMMU enabled currently */
+//		auto src = villas::HostRam::getAllocator().allocate<char>(len);
+//		auto dst = villas::HostRam::getAllocator().allocate<char>(len);
+
+		/* Make sure memory is accessible for DMA */
+		cr_assert(dma.makeAccesibleFromVA(src.getMemoryBlock()),
+		          "Source memory not accessible for DMA");
+		cr_assert(dma.makeAccesibleFromVA(dst.getMemoryBlock()),
+		          "Destination memory not accessible for DMA");
 
 		/* Get new random data */
 		const size_t lenRandom = read_random(&src, len);
 		cr_assert(len == lenRandom, "Failed to get random data");
 
+
 		/* Start transfer */
 		cr_assert(dma.memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len),
 		          "DMA ping pong failed");
 
+
 		/* Compare data */
 		cr_assert(memcmp(&src, &dst, len) == 0, "Data not equal");
 

From 13fd3f3c2a825d48e216eeba8d32ef6f53b0c00a Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Tue, 15 May 2018 17:35:45 +0200
Subject: [PATCH 30/31] gpu: implement basic GPU plugin that can do DMA to and
 from its memory

Using CUDA, memory can be allocated on the GPU and shared to peers on
the PCIe bus such as the FPGA. Furthermore, the DMA on the GPU can also
be used to read and write to/from other memory on the PCIe bus, such as
BRAM on the FPGA.
---
 fpga/include/villas/plugin.hpp      |   1 +
 fpga/lib/gpu/CMakeLists.txt         |  24 +-
 fpga/lib/gpu/gdrcopy                |   2 +-
 fpga/lib/gpu/gpu.cpp                | 467 ++++++++++++++++++++++++++++
 fpga/lib/gpu/include/villas/gpu.hpp |  80 +++++
 fpga/lib/gpu/kernels.cu             |  42 +++
 fpga/lib/gpu/kernels.hpp            |  16 +
 fpga/tests/CMakeLists.txt           |   1 +
 fpga/tests/gpu.cpp                  | 129 ++++++++
 9 files changed, 755 insertions(+), 7 deletions(-)
 create mode 100644 fpga/lib/gpu/kernels.cu
 create mode 100644 fpga/lib/gpu/kernels.hpp
 create mode 100644 fpga/tests/gpu.cpp

diff --git a/fpga/include/villas/plugin.hpp b/fpga/include/villas/plugin.hpp
index 35a4912ba..fd3fd3fc8 100644
--- a/fpga/include/villas/plugin.hpp
+++ b/fpga/include/villas/plugin.hpp
@@ -40,6 +40,7 @@ public:
 		Unknown,
 		FpgaIp,
 		FpgaCard,
+		Gpu
 	};
 
 	Plugin(Type type, const std::string& name);
diff --git a/fpga/lib/gpu/CMakeLists.txt b/fpga/lib/gpu/CMakeLists.txt
index 68c3993ea..c2dd78804 100644
--- a/fpga/lib/gpu/CMakeLists.txt
+++ b/fpga/lib/gpu/CMakeLists.txt
@@ -1,14 +1,26 @@
-cmake_minimum_required(VERSION 3.7)
+cmake_minimum_required(VERSION 3.8)
 
 project(villas-gpu
 	VERSION 1.0
 	DESCRIPTION "VILLASgpu"
-	LANGUAGES C CXX)
+	LANGUAGES C CXX CUDA)
 
-# fail if CUDA not found
-find_package(CUDA QUIET REQUIRED)
+add_subdirectory(gdrcopy)
 
-cuda_add_library(villas-gpu SHARED gpu.cpp)
+add_library(villas-gpu SHARED gpu.cpp kernels.cu)
+
+target_compile_options(villas-gpu PRIVATE -g)
+
+set_source_files_properties(gpu.cpp PROPERTIES
+	LANGUAGE CUDA)
+
+target_include_directories(villas-gpu PRIVATE /opt/cuda/include)
+
+target_link_libraries(villas-gpu
+	PRIVATE villas-common gdrapi cuda)
 
 target_include_directories(villas-gpu
-	PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include)
+	PUBLIC
+	    ${CMAKE_CURRENT_LIST_DIR}/include
+	PRIVATE
+	    ${CMAKE_CURRENT_LIST_DIR})
diff --git a/fpga/lib/gpu/gdrcopy b/fpga/lib/gpu/gdrcopy
index 2b933176d..0441daa44 160000
--- a/fpga/lib/gpu/gdrcopy
+++ b/fpga/lib/gpu/gdrcopy
@@ -1 +1 @@
-Subproject commit 2b933176d0fd20f10bddfdf574a1d3229ca1ecdf
+Subproject commit 0441daa447b80260c4e11096f03e88f7be08bfa2
diff --git a/fpga/lib/gpu/gpu.cpp b/fpga/lib/gpu/gpu.cpp
index 4314fcb65..e8f7d58ec 100644
--- a/fpga/lib/gpu/gpu.cpp
+++ b/fpga/lib/gpu/gpu.cpp
@@ -1,7 +1,474 @@
+#include <cstdio>
+#include <cstdint>
+#include <sys/mman.h>
+
+#include <memory>
+#include <algorithm>
+
 #include <villas/gpu.hpp>
+#include <villas/log.hpp>
+#include <villas/kernel/pci.h>
+#include <memory_manager.hpp>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <gdrapi.h>
+
+#include "kernels.hpp"
 
 namespace villas {
 namespace gpu {
 
+static GpuFactory gpuFactory;
+
+GpuAllocator::GpuAllocator(Gpu& gpu) :
+    BaseAllocator(gpu.masterPciEAddrSpaceId),
+    gpu(gpu)
+{
+	free = [&](MemoryBlock* mem) {
+		cudaSetDevice(gpu.gpuId);
+		if(cudaFree(reinterpret_cast<void*>(mem->getOffset())) != cudaSuccess) {
+			logger->warn("cudaFree() failed for {:#x} of size {:#x}",
+			             mem->getOffset(), mem->getSize());
+		}
+
+		removeMemoryBlock(*mem);
+	};
+}
+
+std::string
+villas::gpu::GpuAllocator::getName() const
+{
+	std::stringstream name;
+	name << "GpuAlloc" << getAddrSpaceId();
+	return name.str();
+}
+
+
+GpuFactory::GpuFactory() :
+    Plugin(Plugin::Type::Gpu, "GPU")
+{
+	logger = loggerGetOrCreate("GpuFactory");
+}
+
+// required to be defined here for PIMPL to compile
+Gpu::~Gpu()
+{
+	auto& mm = MemoryManager::get();
+	mm.removeAddressSpace(masterPciEAddrSpaceId);
+}
+
+
+// we use PIMPL in order to hide gdrcopy types from the public header
+class Gpu::impl {
+public:
+	gdr_t gdr;
+	struct pci_device pdev;
+};
+
+std::string Gpu::getName() const
+{
+	cudaDeviceProp deviceProp;
+	if(cudaGetDeviceProperties(&deviceProp, gpuId) != cudaSuccess) {
+		// logger not yet availabe
+		loggerGetOrCreate("Gpu")->error("Cannot retrieve properties for GPU {}", gpuId);
+		throw std::exception();
+	}
+
+	std::stringstream name;
+	name << "gpu" << gpuId << "(" << deviceProp.name << ")";
+
+	return name.str();
+}
+
+bool Gpu::registerIoMemory(const MemoryBlock& mem)
+{
+	auto& mm = MemoryManager::get();
+	const auto pciAddrSpaceId = mm.getPciAddressSpace();
+
+	// Check if we need to map anything at all, maybe it's already reachable
+	try {
+		// TODO: there might already be a path through the graph, but there's no
+		// overlapping window, so this will fail badly!
+		auto translation = mm.getTranslation(masterPciEAddrSpaceId,
+		                                     mem.getAddrSpaceId());
+		if(translation.getSize() >= mem.getSize()) {
+			// there is already a sufficient path
+			logger->debug("Already mapped through another mapping");
+			return true;
+		} else {
+			logger->warn("There's already a mapping, but too small");
+		}
+	} catch(const std::out_of_range&) {
+		// not yet reachable, that's okay, proceed
+	}
+
+
+	// In order to register IO memory with CUDA, it has to be mapped to the VA
+	// space of the current process (requirement of CUDA API). Check this now.
+	MemoryManager::AddressSpaceId mappedBaseAddrSpaceId;
+	try {
+		auto path = mm.findPath(mm.getProcessAddressSpace(), mem.getAddrSpaceId());
+		// first node in path is the mapped memory space whose virtual address
+		// we need to hand to CUDA
+		mappedBaseAddrSpaceId = path.front();
+	} catch (const std::out_of_range&) {
+		logger->error("Memory not reachable from process, but required by CUDA");
+		return false;
+	}
+
+	// determine the base address of the mapped memory region needed by CUDA
+	const auto translationProcess = mm.getTranslationFromProcess(mappedBaseAddrSpaceId);
+	const uintptr_t baseAddrForProcess = translationProcess.getLocalAddr(0);
+
+
+	// Now check that the memory is also reachable via PCIe bus, otherwise GPU
+	// has no means to access it.
+	uintptr_t baseAddrOnPci;
+	size_t	sizeOnPci;
+	try {
+		auto translationPci = mm.getTranslation(pciAddrSpaceId,
+		                                        mappedBaseAddrSpaceId);
+		baseAddrOnPci = translationPci.getLocalAddr(0);
+		sizeOnPci = translationPci.getSize();
+	} catch(const std::out_of_range&) {
+		logger->error("Memory is not reachable via PCIe bus");
+		return false;
+	}
+
+	if(sizeOnPci < mem.getSize()) {
+		logger->warn("VA mapping of IO memory is too small: {:#x} instead of {:#x} bytes",
+		              sizeOnPci, mem.getSize());
+		logger->warn("If something later on fails or behaves strangely, this might be the cause!");
+	}
+
+
+	cudaSetDevice(gpuId);
+
+	auto baseAddrVA = reinterpret_cast<void*>(baseAddrForProcess);
+	if(cudaHostRegister(baseAddrVA, sizeOnPci, cudaHostRegisterIoMemory) != cudaSuccess) {
+		logger->error("Cannot register IO memory for block {}", mem.getAddrSpaceId());
+		return false;
+	}
+
+	void* devicePointer = nullptr;
+	if(cudaHostGetDevicePointer(&devicePointer, baseAddrVA, 0) != cudaSuccess) {
+		logger->error("Cannot retrieve device pointer for IO memory");
+		return false;
+	}
+
+	mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), baseAddrOnPci,
+	                 sizeOnPci, "CudaIoMem", masterPciEAddrSpaceId, pciAddrSpaceId);
+
+	return true;
+}
+
+bool
+Gpu::registerHostMemory(const MemoryBlock& mem)
+{
+	auto& mm = MemoryManager::get();
+
+	auto translation = mm.getTranslationFromProcess(mem.getAddrSpaceId());
+	auto localBase = reinterpret_cast<void*>(translation.getLocalAddr(0));
+
+	int ret = cudaHostRegister(localBase, mem.getSize(), 0);
+	if(ret != cudaSuccess) {
+		logger->error("Cannot register memory block {} addr={:p} size={:#x} to CUDA: ret={}",
+		              mem.getAddrSpaceId(), localBase, mem.getSize(), ret);
+		return false;
+	}
+
+	void* devicePointer = nullptr;
+	ret = cudaHostGetDevicePointer(&devicePointer, localBase, 0);
+	if(ret != cudaSuccess) {
+		logger->error("Cannot retrieve device pointer for IO memory: ret={}", ret);
+		return false;
+	}
+
+	mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), 0, mem.getSize(),
+	                 "CudaHostMem", masterPciEAddrSpaceId, mem.getAddrSpaceId());
+
+	return true;
+}
+
+bool Gpu::makeAccessibleToPCIeAndVA(const MemoryBlock& mem)
+{
+	if(pImpl->gdr == nullptr) {
+		logger->warn("GDRcopy not available");
+		return false;
+	}
+
+	auto& mm = MemoryManager::get();
+
+	try {
+		auto path = mm.findPath(masterPciEAddrSpaceId, mem.getAddrSpaceId());
+		// if first hop is the PCIe bus, we know that memory is off-GPU
+		if(path.front() == mm.getPciAddressSpace()) {
+			throw std::out_of_range("Memory block is outside of this GPU");
+		}
+
+	} catch (const std::out_of_range&) {
+		logger->error("Trying to map non-GPU memory block");
+		return false;
+	}
+
+	logger->debug("retrieve complete device pointer from point of view of GPU");
+	// retrieve complete device pointer from point of view of GPU
+	auto translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                     mem.getAddrSpaceId());
+	CUdeviceptr devptr = translation.getLocalAddr(0);
+
+	int ret;
+
+	// required to set this flag before mapping
+	unsigned int enable = 1;
+	ret = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, devptr);
+	if(ret != CUDA_SUCCESS) {
+		logger->error("Cannot set pointer attributes on memory block {}: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	gdr_mh_t mh;
+	ret = gdr_pin_buffer(pImpl->gdr, devptr, mem.getSize(), 0, 0, &mh);
+	if(ret != 0) {
+		logger->error("Cannot pin memory block {} via gdrcopy: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	void* bar = nullptr;
+	ret = gdr_map(pImpl->gdr, mh, &bar, mem.getSize());
+	if(ret != 0) {
+		logger->error("Cannot map memory block {} via gdrcopy: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	gdr_info_t info;
+	ret = gdr_get_info(pImpl->gdr, mh, &info);
+	if(ret != 0) {
+		logger->error("Cannot get info for mapping of memory block {}: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	const uintptr_t offset = info.va - devptr;
+	const uintptr_t userPtr = reinterpret_cast<uintptr_t>(bar) + offset;
+
+	logger->debug("BAR ptr:          {:p}",  bar);
+	logger->debug("info.va:          {:#x}", info.va);
+	logger->debug("info.mapped_size: {:#x}", info.mapped_size);
+	logger->debug("info.page_size:   {:#x}", info.page_size);
+	logger->debug("offset:           {:#x}", offset);
+	logger->debug("user pointer:     {:#x}", userPtr);
+
+	// mapping to acceses memory block from process
+	mm.createMapping(userPtr, 0, info.mapped_size, "GDRcopy",
+	                 mm.getProcessAddressSpace(), mem.getAddrSpaceId());
+
+	// retrieve bus address
+	uint64_t addr[8];
+	ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, 8);
+
+	for(int i = 0; i < ret; i++) {
+		logger->debug("DMA addr[{}]:      {:#x}", i, addr[i]);
+	}
+
+	if(ret != 1) {
+		logger->error("Only one DMA address per block supported at the moment");
+		return false;
+	}
+
+	// mapping to access memory block from peer devices via PCIe
+	mm.createMapping(addr[0], 0, mem.getSize(), "GDRcopyDMA",
+	                 mm.getPciAddressSpace(), mem.getAddrSpaceId());
+
+	return true;
+}
+
+bool
+Gpu::makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem)
+{
+	// Check which kind of memory this is and where it resides
+	// There are two possibilities:
+	//  - Host memory not managed by CUDA
+	//  - IO memory somewhere on the PCIe bus
+
+	auto& mm = MemoryManager::get();
+
+	bool isIoMemory = false;
+	try {
+		auto path = mm.findPath(mm.getPciAddressSpace(), mem.getAddrSpaceId());
+		isIoMemory = true;
+	} catch(const std::out_of_range&) {
+		// not reachable via PCI -> not IO memory
+	}
+
+	if(isIoMemory) {
+		logger->debug("Memory block {} is assumed to be IO memory",
+		              mem.getAddrSpaceId());
+
+		return registerIoMemory(mem);
+	} else {
+		logger->debug("Memory block {} is assumed to be non-CUDA host memory",
+		              mem.getAddrSpaceId());
+
+		return registerHostMemory(mem);
+	}
+}
+
+void Gpu::memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
+{
+	auto& mm = MemoryManager::get();
+
+	auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         src.getAddrSpaceId());
+	const void* src_buf = reinterpret_cast<void*>(src_translation.getLocalAddr(0));
+
+	auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         dst.getAddrSpaceId());
+	void* dst_buf = reinterpret_cast<void*>(dst_translation.getLocalAddr(0));
+
+	cudaSetDevice(gpuId);
+	cudaMemcpy(dst_buf, src_buf, size, cudaMemcpyDefault);
+}
+
+void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
+{
+	auto& mm = MemoryManager::get();
+
+	auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         src.getAddrSpaceId());
+	auto src_buf = reinterpret_cast<uint8_t*>(src_translation.getLocalAddr(0));
+
+	auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         dst.getAddrSpaceId());
+	auto dst_buf = reinterpret_cast<uint8_t*>(dst_translation.getLocalAddr(0));
+
+	cudaSetDevice(gpuId);
+	kernel_memcpy<<<1, 1>>>(dst_buf, src_buf, size);
+	cudaDeviceSynchronize();
+}
+
+
+std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
+GpuAllocator::allocateBlock(size_t size)
+{
+	cudaSetDevice(gpu.gpuId);
+
+	void* addr;
+	if(cudaSuccess != cudaMalloc(&addr, size)) {
+		logger->error("cudaMalloc(..., size={}) failed", size);
+		throw std::bad_alloc();
+	}
+
+	auto& mm = MemoryManager::get();
+
+	// assemble name for this block
+	std::stringstream name;
+	name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
+
+	auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
+	auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
+
+	const auto localAddr = reinterpret_cast<uintptr_t>(addr);
+	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
+	        mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
+
+	insertMemoryBlock(*mem);
+
+	gpu.makeAccessibleToPCIeAndVA(*mem);
+
+	return mem;
+}
+
+
+Gpu::Gpu(int gpuId) :
+    pImpl{std::make_unique<impl>()},
+    gpuId(gpuId)
+{
+	logger = loggerGetOrCreate(getName());
+
+	pImpl->gdr = gdr_open();
+	if(pImpl->gdr == nullptr) {
+		logger->warn("No GDRcopy support enabled, cannot open /dev/gdrdrv");
+	}
+}
+
+bool Gpu::init()
+{
+	auto& mm = MemoryManager::get();
+
+	const auto gpuPciEAddrSpaceName = mm.getMasterAddrSpaceName(getName(), "PCIe");
+	masterPciEAddrSpaceId = mm.getOrCreateAddressSpace(gpuPciEAddrSpaceName);
+
+	allocator = std::make_unique<GpuAllocator>(*this);
+
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, gpuId);
+
+	pImpl->pdev.slot = {
+	    deviceProp.pciDomainID,
+	    deviceProp.pciBusID,
+	    deviceProp.pciDeviceID,
+	    0};
+
+	struct pci_region* pci_regions = nullptr;
+	const size_t pci_num_regions = pci_get_regions(&pImpl->pdev, &pci_regions);
+	for(size_t i = 0; i < pci_num_regions; i++) {
+		const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
+		logger->info("BAR{}: bus addr={:#x} size={:#x}",
+		             pci_regions[i].num, pci_regions[i].start, region_size);
+
+		char name[] = "BARx";
+		name[3] = '0' + pci_regions[i].num;
+
+		auto gpuBarXAddrSpaceName = mm.getSlaveAddrSpaceName(getName(), name);
+		auto gpuBarXAddrSpaceId = mm.getOrCreateAddressSpace(gpuBarXAddrSpaceName);
+
+		mm.createMapping(pci_regions[i].start, 0, region_size,
+		                 std::string("PCI-") + name,
+		                 mm.getPciAddressSpace(), gpuBarXAddrSpaceId);
+	}
+
+	free(pci_regions);
+
+	return true;
+}
+
+
+std::list<std::unique_ptr<Gpu>>
+GpuFactory::make()
+{
+	int deviceCount = 0;
+	cudaGetDeviceCount(&deviceCount);
+
+	std::list<std::unique_ptr<Gpu>> gpuList;
+
+	for(int gpuId = 0; gpuId < deviceCount; gpuId++) {
+		if(cudaSetDevice(gpuId) != cudaSuccess) {
+			logger->warn("Cannot activate GPU {}", gpuId);
+			continue;
+		}
+
+		auto gpu = std::make_unique<Gpu>(gpuId);
+
+		if(not gpu->init()) {
+			logger->warn("Cannot initialize GPU {}", gpuId);
+			continue;
+		}
+
+		gpuList.emplace_back(std::move(gpu));
+	}
+
+	logger->info("Initialized {} GPUs", gpuList.size());
+	for(auto& gpu : gpuList) {
+		logger->debug(" - {}", gpu->getName());
+	}
+
+	return gpuList;
+}
+
 } // namespace villas
 } // namespace gpu
diff --git a/fpga/lib/gpu/include/villas/gpu.hpp b/fpga/lib/gpu/include/villas/gpu.hpp
index 903dd08c2..88b316815 100644
--- a/fpga/lib/gpu/include/villas/gpu.hpp
+++ b/fpga/lib/gpu/include/villas/gpu.hpp
@@ -1,7 +1,87 @@
 #pragma once
 
+#include <sstream>
+
+#include <plugin.hpp>
+#include <memory_manager.hpp>
+#include <memory.hpp>
+#include <villas/log.hpp>
+
+
 namespace villas {
 namespace gpu {
 
+class GpuAllocator;
+
+class Gpu {
+	friend GpuAllocator;
+public:
+	Gpu(int gpuId);
+	~Gpu();
+
+	bool init();
+
+	std::string getName() const;
+
+	GpuAllocator& getAllocator() const
+	{ return *allocator; }
+
+
+	bool makeAccessibleToPCIeAndVA(const MemoryBlock& mem);
+
+	/// Make some memory block accssible for this GPU
+	bool makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem);
+
+	void memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
+
+	void memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
+
+private:
+	bool registerIoMemory(const MemoryBlock& mem);
+	bool registerHostMemory(const MemoryBlock& mem);
+
+private:
+	class impl;
+	std::unique_ptr<impl> pImpl;
+
+	// master, will be used to derived slave addr spaces for allocation
+	MemoryManager::AddressSpaceId masterPciEAddrSpaceId;
+
+	MemoryManager::AddressSpaceId slaveMemoryAddrSpaceId;
+
+	SpdLogger logger;
+
+	int gpuId;
+
+	std::unique_ptr<GpuAllocator> allocator;
+};
+
+
+class GpuAllocator : public BaseAllocator<GpuAllocator> {
+public:
+	GpuAllocator(Gpu& gpu);
+
+	std::string getName() const;
+
+	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
+	allocateBlock(size_t size);
+
+private:
+	Gpu& gpu;
+};
+
+class GpuFactory : public Plugin {
+public:
+	GpuFactory();
+
+	std::list<std::unique_ptr<Gpu>>
+	make();
+
+	void run(void*);
+
+private:
+	SpdLogger logger;
+};
+
 } // namespace villas
 } // namespace gpu
diff --git a/fpga/lib/gpu/kernels.cu b/fpga/lib/gpu/kernels.cu
new file mode 100644
index 000000000..7e8b5524d
--- /dev/null
+++ b/fpga/lib/gpu/kernels.cu
@@ -0,0 +1,42 @@
+#include <stdio.h>
+
+#include <villas/gpu.hpp>
+#include "kernels.hpp"
+
+#include "cuda_runtime.h"
+#include <cuda.h>
+
+namespace villas {
+namespace gpu {
+
+
+__global__ void
+kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter)
+{
+	printf("[gpu] hello!\n");
+	printf("[gpu] mailbox: %p\n", mailbox);
+
+	printf("[kernel] started\n");
+
+	while(1) {
+		if (*mailbox == 1) {
+			*mailbox = 0;
+			printf("[gpu] counter = %d\n", *counter);
+			break;
+		}
+	}
+
+	printf("[gpu] quit\n");
+}
+
+__global__ void
+kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length)
+{
+	while(length > 0) {
+		*dst++ = *src++;
+		length--;
+	}
+}
+
+} // namespace villas
+} // namespace gpu
diff --git a/fpga/lib/gpu/kernels.hpp b/fpga/lib/gpu/kernels.hpp
new file mode 100644
index 000000000..986eba31e
--- /dev/null
+++ b/fpga/lib/gpu/kernels.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstdint>
+#include <cuda_runtime.h>
+
+namespace villas {
+namespace gpu {
+
+__global__ void
+kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter);
+
+__global__ void
+kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length);
+
+} // namespace villas
+} // namespace gpu
diff --git a/fpga/tests/CMakeLists.txt b/fpga/tests/CMakeLists.txt
index 2e4cfe041..f36647ab3 100644
--- a/fpga/tests/CMakeLists.txt
+++ b/fpga/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SOURCES
 	logging.cpp
 	dma.cpp
     fifo.cpp
+	gpu.cpp
 #	hls.c
 #	intc.c
 #	rtds_rtt.c
diff --git a/fpga/tests/gpu.cpp b/fpga/tests/gpu.cpp
new file mode 100644
index 000000000..bdc3aba92
--- /dev/null
+++ b/fpga/tests/gpu.cpp
@@ -0,0 +1,129 @@
+#include <criterion/criterion.h>
+
+#include <map>
+#include <string>
+
+#include <villas/log.hpp>
+#include <villas/fpga/card.hpp>
+#include <villas/fpga/ips/dma.hpp>
+#include <villas/fpga/ips/bram.hpp>
+
+#include <villas/utils.h>
+
+#include "global.hpp"
+
+#include <villas/memory.hpp>
+#include <villas/gpu.hpp>
+
+
+Test(fpga, gpu_dma, .description = "GPU DMA tests")
+{
+	auto logger = loggerGetOrCreate("unittest:dma");
+
+	auto& card = state.cards.front();
+
+	villas::Plugin* plugin = villas::Plugin::lookup(villas::Plugin::Type::Gpu, "");
+	auto gpuPlugin = dynamic_cast<villas::gpu::GpuFactory*>(plugin);
+	cr_assert_not_null(gpuPlugin, "No GPU plugin found");
+
+	auto gpus = gpuPlugin->make();
+	cr_assert(gpus.size() > 0, "No GPUs found");
+
+	// just get first cpu
+	auto& gpu = gpus.front();
+
+	size_t count = 0;
+	for(auto& ip : card->ips) {
+		// skip non-dma IPs
+		if(*ip != villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"))
+			continue;
+
+		logger->info("Testing {}", *ip);
+
+		auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(ip.get());
+		cr_assert_not_null(bram, "Couldn't find BRAM");
+
+		count++;
+
+		size_t len = 4 * (1 << 10);
+
+		/* Allocate memory to use with DMA */
+
+		auto bram0 = bram->getAllocator().allocate<char>(len);
+		auto bram1 = bram->getAllocator().allocate<char>(len);
+
+		gpu->makeAccessibleFromPCIeOrHostRam(bram0.getMemoryBlock());
+		gpu->makeAccessibleFromPCIeOrHostRam(bram1.getMemoryBlock());
+
+		auto hostRam0 = villas::HostRam::getAllocator().allocate<char>(len);
+		auto hostRam1 = villas::HostRam::getAllocator().allocate<char>(len);
+
+		gpu->makeAccessibleFromPCIeOrHostRam(hostRam0.getMemoryBlock());
+		gpu->makeAccessibleFromPCIeOrHostRam(hostRam1.getMemoryBlock());
+
+		auto dmaRam0 = villas::HostDmaRam::getAllocator().allocate<char>(len);
+		auto dmaRam1 = villas::HostDmaRam::getAllocator().allocate<char>(len);
+
+		gpu->makeAccessibleFromPCIeOrHostRam(dmaRam0.getMemoryBlock());
+		gpu->makeAccessibleFromPCIeOrHostRam(dmaRam1.getMemoryBlock());
+
+		auto gpuMem0 = gpu->getAllocator().allocate<char>(64 << 10);
+		auto gpuMem1 = gpu->getAllocator().allocate<char>(64 << 10);
+
+		gpu->makeAccessibleToPCIeAndVA(gpuMem0.getMemoryBlock());
+		gpu->makeAccessibleToPCIeAndVA(gpuMem1.getMemoryBlock());
+
+
+//		auto& src = bram0;
+//		auto& dst = bram1;
+
+//		auto& src = hostRam0;
+//		auto& dst = hostRam1;
+
+		auto& src = dmaRam0;
+//		auto& dst = dmaRam1;
+
+//		auto& src = gpuMem0;
+		auto& dst = gpuMem1;
+
+
+		std::list<std::pair<std::string, std::function<void()>>> memcpyFuncs = {
+		    {"cudaMemcpy", [&]() {gpu->memcpySync(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
+		    {"CUDA kernel", [&]() {gpu->memcpyKernel(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
+	    };
+
+		auto dmaIp = card->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_dma:"));
+		auto dma = dynamic_cast<villas::fpga::ip::Dma*>(dmaIp);
+
+		if(dma != nullptr and dma->connectLoopback()) {
+			memcpyFuncs.push_back({
+			    "DMA memcpy", [&]() {
+			        if(not dma->makeAccesibleFromVA(src.getMemoryBlock()) or
+			           not dma->makeAccesibleFromVA(dst.getMemoryBlock())) {
+			            return;
+			        }
+			        dma->memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len);
+			    }});
+		}
+
+		for(auto& [name, memcpyFunc] : memcpyFuncs) {
+			logger->info("Testing {}", name);
+
+			/* Get new random data */
+			const size_t lenRandom = read_random(&src, len);
+			cr_assert(len == lenRandom, "Failed to get random data");
+
+			memcpyFunc();
+			const bool success = memcmp(&src, &dst, len) == 0;
+
+			logger->info("  {}", success ?
+			                 TXT_GREEN("Passed") :
+			                 TXT_RED("Failed"));
+		}
+
+		villas::MemoryManager::get().dump();
+	}
+
+
+	cr_assert(count > 0, "No BRAM found");
+}

From d2384abb9df0c8517fcc5765bb342859d15b9d87 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Wed, 16 May 2018 10:58:18 +0200
Subject: [PATCH 31/31] cmake: only build GPU library if CUDA is present

---
 fpga/lib/CMakeLists.txt   | 12 ++++++++++--
 fpga/tests/CMakeLists.txt |  5 ++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/fpga/lib/CMakeLists.txt b/fpga/lib/CMakeLists.txt
index fff01b8e1..5ff3a50ba 100644
--- a/fpga/lib/CMakeLists.txt
+++ b/fpga/lib/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_subdirectory(common)
-add_subdirectory(gpu)
 
 set(SOURCES
 	vlnv.cpp
@@ -36,7 +35,16 @@ find_package(Threads)
 add_library(villas-fpga SHARED ${SOURCES})
 
 target_link_libraries(villas-fpga PUBLIC villas-common)
-target_link_libraries(villas-fpga PUBLIC villas-gpu)
+
+# GPU library is optional, check for CUDA presence
+include(CheckLanguage)
+check_language(CUDA)
+if(CMAKE_CUDA_COMPILER)
+	add_subdirectory(gpu)
+	target_link_libraries(villas-fpga PUBLIC villas-gpu)
+else()
+	message("No CUDA support, not building GPU library")
+endif()
 
 target_compile_definitions(villas-fpga PRIVATE
 	BUILDID=\"abc\"
diff --git a/fpga/tests/CMakeLists.txt b/fpga/tests/CMakeLists.txt
index f36647ab3..c7b762ac3 100644
--- a/fpga/tests/CMakeLists.txt
+++ b/fpga/tests/CMakeLists.txt
@@ -4,7 +4,6 @@ set(SOURCES
 	logging.cpp
 	dma.cpp
     fifo.cpp
-	gpu.cpp
 #	hls.c
 #	intc.c
 #	rtds_rtt.c
@@ -13,6 +12,10 @@ set(SOURCES
     graph.cpp
 )
 
+if(CMAKE_CUDA_COMPILER)
+	list(APPEND SOURCES gpu.cpp)
+endif()
+
 add_executable(unit-tests ${SOURCES})
 
 find_package(Criterion REQUIRED)