Merge branch 'feature/gpu' into 'develop'

Add basic GPU/CUDA integration as a shared library See merge request acs/public/villas/VILLASfpga-code!9
2025-03-30 00:00:11 +01:00 · 2018-05-16 11:15:36 +02:00 · 2018-05-16 11:15:36 +02:00 · 929ed2f393
commit 929ed2f393
parent e91be5ed2c d2384abb9d
33 changed files with 1577 additions and 145 deletions
--- a/fpga/.gitmodules
+++ b/fpga/.gitmodules
@ -4,3 +4,9 @@
 [submodule "thirdparty/libxil"]
 	path = thirdparty/libxil
 	url = https://git.rwth-aachen.de/acs/public/villas/libxil.git
+[submodule "lib/gpu/gdrcopy"]
+	path = lib/gpu/gdrcopy
+	url = https://github.com/daniel-k/gdrcopy.git
+[submodule "thirdparty/udmabuf"]
+	path = thirdparty/udmabuf
+	url = https://github.com/ikwzm/udmabuf
--- a/fpga/etc/fpga.json
+++ b/fpga/etc/fpga.json
@ -120,30 +120,30 @@
            "M_AXI_MM2S": {
              "pcie_0_axi_pcie_0": {
                "BAR0": {
-                  "baseaddr": 2147483648,
+                  "baseaddr": 0,
                  "highaddr": 4294967295,
-                  "size": 2147483648
+                  "size": 4294967296
                }
              }
            },
            "M_AXI_S2MM": {
              "pcie_0_axi_pcie_0": {
                "BAR0": {
-                  "baseaddr": 2147483648,
+                  "baseaddr": 0,
                  "highaddr": 4294967295,
-                  "size": 2147483648
+                  "size": 4294967296
                }
              }
            }
          },
          "ports": [
            {
-              "role": "initiator",
+              "role": "master",
              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
              "name": "MM2S"
            },
            {
-              "role": "target",
+              "role": "slave",
              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
              "name": "S2MM"
            }
@ -159,30 +159,30 @@
            "M_AXI_MM2S": {
              "pcie_0_axi_pcie_0": {
                "BAR0": {
-                  "baseaddr": 2147483648,
+                  "baseaddr": 0,
                  "highaddr": 4294967295,
-                  "size": 2147483648
+                  "size": 4294967296
                }
              }
            },
            "M_AXI_S2MM": {
              "pcie_0_axi_pcie_0": {
                "BAR0": {
-                  "baseaddr": 2147483648,
+                  "baseaddr": 0,
                  "highaddr": 4294967295,
-                  "size": 2147483648
+                  "size": 4294967296
                }
              }
            }
          },
          "ports": [
            {
-              "role": "initiator",
+              "role": "master",
              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
              "name": "MM2S"
            },
            {
-              "role": "target",
+              "role": "slave",
              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
              "name": "S2MM"
            }
@ -214,22 +214,22 @@
          "vlnv": "xilinx.com:ip:axis_switch:1.1",
          "ports": [
            {
-              "role": "initiator",
+              "role": "master",
              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
              "name": "M03_AXIS"
            },
            {
-              "role": "target",
+              "role": "slave",
              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
              "name": "S03_AXIS"
            },
            {
-              "role": "initiator",
+              "role": "master",
              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
              "name": "M04_AXIS"
            },
            {
-              "role": "target",
+              "role": "slave",
              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
              "name": "S04_AXIS"
            }
@ -237,7 +237,7 @@
          "num_ports": 7
        },
        "hier_0_hls_dft_0": {
-          "vlnv": "acs.eonerc.rwth-aachen.de:hls:hls_dft:1.0",
+          "vlnv": "acs.eonerc.rwth-aachen.de:hls:hls_dft:1.1",
          "ports": [
            {
              "role": "master",
@ -249,7 +249,10 @@
              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:5",
              "name": "input_r"
            }
-          ]
+          ],
+          "irqs": {
+            "interrupt": "pcie_0_axi_pcie_intc_0:1"
+          }
        },
        "hier_0_rtds_axis_0": {
          "vlnv": "acs.eonerc.rwth-aachen.de:user:rtds_axis:1.0",
@ -358,6 +361,19 @@
                }
              }
            }
+          },
+          "axi_bars": {
+            "BAR0": {
+              "translation": 0,
+              "baseaddr": 0,
+              "highaddr": 4294967295,
+              "size": 4294967296
+            }
+          },
+          "pcie_bars": {
+            "BAR0": {
+              "translation": 0
+            }
          }
        },
        "pcie_0_axi_pcie_intc_0": {
--- a/fpga/include/villas/directed_graph.hpp
+++ b/fpga/include/villas/directed_graph.hpp
@ -5,6 +5,7 @@
 #include <memory>
 #include <sstream>
 #include <string>
+#include <fstream>
 #include <stdexcept>
 #include <algorithm>

@ -58,6 +59,12 @@ public:
 	operator==(const Edge& other)
 	{ return this->id == other.id; }

+	Vertex::Identifier getVertexTo() const
+	{ return to; }
+
+	Vertex::Identifier getVertexFrom() const
+	{ return from; }
+
 private:
 	Identifier id;
 	Vertex::Identifier from;
@ -81,7 +88,7 @@ public:

 	std::shared_ptr<VertexType> getVertex(VertexIdentifier vertexId) const
 	{
-		if(vertexId < 0 or vertexId >= lastVertexId)
+		if(vertexId >= lastVertexId)
 			throw std::invalid_argument("vertex doesn't exist");

 		// cannot use [] operator, because creates non-existing elements
@ -92,7 +99,10 @@ public:
 	template<class UnaryPredicate>
 	VertexIdentifier findVertex(UnaryPredicate p)
 	{
-		for(auto& [vertexId, vertex] : vertices) {
+		for(auto& v : vertices) {
+			auto& vertexId = v.first;
+			auto& vertex = v.second;
+
 			if(p(vertex)) {
 				return vertexId;
 			}
@ -103,7 +113,7 @@ public:

 	std::shared_ptr<EdgeType> getEdge(EdgeIdentifier edgeId) const
 	{
-		if(edgeId < 0 or edgeId >= lastEdgeId)
+		if(edgeId >= lastEdgeId)
 			throw std::invalid_argument("edge doesn't exist");

 		// cannot use [] operator, because creates non-existing elements
@ -177,7 +187,9 @@ public:
 		// delete every edge that start or ends at this vertex
 		auto it = edges.begin();
 		while(it != edges.end()) {
-			auto& [edgeId, edge] = *it;
+			auto& edgeId = it->first;
+			auto& edge = it->second;
+
 			bool removeEdge = false;

 			if(edge->to == vertexId) {
@ -207,9 +219,17 @@ public:
 	vertexGetEdges(VertexIdentifier vertexId) const
 	{ return getVertex(vertexId)->edges; }

+
+	using check_path_fn = std::function<bool(const Path&)>;
+
+	static bool
+	checkPath(const Path&)
+	{ return true; }
+
 	bool getPath(VertexIdentifier fromVertexId,
 	             VertexIdentifier toVertexId,
-	             Path& path)
+	             Path& path,
+	             check_path_fn pathCheckFunc = checkPath)
 	{
 		if(fromVertexId == toVertexId) {
 			// arrived at the destination
@ -239,7 +259,8 @@ public:
 				path.push_back(edgeId);

 				// recursive, depth-first search
-				if(getPath(edgeOfFromVertex->to, toVertexId, path)) {
+				if(getPath(edgeOfFromVertex->to, toVertexId, path, pathCheckFunc) and
+				   pathCheckFunc(path)) {
 					// path found, we're done
 				    return true;
 				} else {
@ -252,11 +273,11 @@ public:
 		return false;
 	}

-	void dump()
+	void dump(const std::string& fileName = "")
 	{
 		logger->info("Vertices:");
-		for(auto& [vertexId, vertex] : vertices) {
-			(void) vertexId;
+		for(auto& v : vertices) {
+			auto& vertex = v.second;

 			// format connected vertices into a list
 			std::stringstream ssEdges;
@ -267,11 +288,29 @@ public:
 			logger->info("  {} connected to: {}", *vertex, ssEdges.str());
 		}

+		std::fstream s(fileName, s.out | s.trunc);
+		if(s.is_open()) {
+			s << "digraph memgraph {" << std::endl;
+		}
+
 		logger->info("Edges:");
-		for(auto& [edgeId, edge] : edges) {
-			(void) edgeId;
+		for(auto& e : edges) {
+			auto& edge = e.second;

 			logger->info("  {}: {} -> {}", *edge, edge->from, edge->to);
+			if(s.is_open()) {
+				auto from = getVertex(edge->from);
+				auto to = getVertex(edge->to);
+
+				s << std::dec;
+				s << "  \"" << *from << "\" -> \"" << *to << "\""
+				  << " [label=\"" << *edge << "\"];" << std::endl;
+			}
+		}
+
+		if(s.is_open()) {
+			s << "}" << std::endl;
+			s.close();
 		}
 	}

--- a/fpga/include/villas/fpga/card.hpp
+++ b/fpga/include/villas/fpga/card.hpp
@ -98,8 +98,9 @@ public:	// TODO: make this private

 	std::string name;			/**< The name of the FPGA card */

-	struct pci *pci;
+	struct pci* pci;
 	struct pci_device filter;		/**< Filter for PCI device. */
+	struct pci_device* pdev;		/**< PCI device handle */

 	/// The VFIO container that this card is part of
 	std::shared_ptr<VfioContainer> vfioContainer;
--- a/fpga/include/villas/fpga/ip.hpp
+++ b/fpga/include/villas/fpga/ip.hpp
@ -193,6 +193,10 @@ protected:
 	InterruptController*
 	getInterruptController(const std::string& interruptName) const;

+	MemoryManager::AddressSpaceId
+	getMasterAddrSpaceByInterface(const std::string& masterInterfaceName) const
+	{ return busMasterInterfaces.at(masterInterfaceName); }
+
 protected:
 	struct IrqPort {
 		int num;
--- a/fpga/include/villas/fpga/ips/dma.hpp
+++ b/fpga/include/villas/fpga/ips/dma.hpp
@ -55,7 +55,9 @@ public:
 	bool readComplete()
 	{ return hasScatterGather() ? readCompleteSG() : readCompleteSimple(); }

-	bool pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len);
+	bool memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len);
+
+	bool makeAccesibleFromVA(const MemoryBlock& mem);

 	inline bool
 	hasScatterGather() const
@ -72,6 +74,8 @@ private:
 	bool writeCompleteSimple();
 	bool readCompleteSimple();

+	bool isMemoryBlockAccesible(const MemoryBlock& mem, const std::string& interface);
+
 private:
 	static constexpr char registerMemory[] = "Reg";

--- a/fpga/include/villas/fpga/ips/pcie.hpp
+++ b/fpga/include/villas/fpga/ips/pcie.hpp
@ -52,6 +52,19 @@ public:
 private:
 	static constexpr char axiInterface[] = "M_AXI";
 	static constexpr char pcieMemory[] = "BAR0";
+
+	struct AxiBar {
+		uintptr_t base;
+		size_t size;
+		uintptr_t translation;
+	};
+
+	struct PciBar {
+		uintptr_t translation;
+	};
+
+	std::map<std::string, AxiBar> axiToPcieTranslations;
+	std::map<std::string, PciBar> pcieToAxiTranslations;
 };


@ -64,6 +77,8 @@ public:
 	getCompatibleVlnvString()
 	{ return "xilinx.com:ip:axi_pcie:"; }

+	bool configureJson(IpCore& ip, json_t *json_ip);
+
 	IpCore* create()
 	{ return new AxiPciExpressBridge; }

--- a/fpga/include/villas/kernel/pci.h
+++ b/fpga/include/villas/kernel/pci.h
@ -9,6 +9,7 @@

 #pragma once

+#include <stdint.h>
 #include "list.h"

 #define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)
@ -33,6 +34,13 @@ struct pci_device {
 	} slot;			/**< Bus, Device, Function (BDF) */
 };

+struct pci_region {
+	int num;
+	uintptr_t start;
+	uintptr_t end;
+	unsigned long long flags;
+};
+
 struct pci {
 	struct list devices; /**< List of available PCI devices in the system (struct pci_device) */
 };
@ -66,6 +74,8 @@ int pci_attach_driver(const struct pci_device *d, const char *driver);
 /** Return the IOMMU group of this PCI device or -1 if the device is not in a group. */
 int pci_get_iommu_group(const struct pci_device *d);

+size_t pci_get_regions(const struct pci_device *d, struct pci_region** regions);
+
 #ifdef __cplusplus
 }
 #endif
--- a/fpga/include/villas/kernel/vfio.hpp
+++ b/fpga/include/villas/kernel/vfio.hpp
@ -18,7 +18,8 @@
 #include <linux/vfio.h>
 #include <sys/mman.h>

-#define VFIO_DEV(x)	"/dev/vfio/" x
+#define VFIO_PATH	"/dev/vfio/"
+#define VFIO_DEV	VFIO_PATH "vfio"

 /* Forward declarations */
 struct pci_device;
@ -90,7 +91,7 @@ public:
 	~VfioGroup();

 	static std::unique_ptr<VfioGroup>
-	attach(int containerFd, int groupIndex);
+	attach(VfioContainer& container, int groupIndex);

 private:
 	/// VFIO group file descriptor
@ -135,6 +136,12 @@ public:
 	/** munmap() a region which has been mapped by vfio_map_region() */
 	bool memoryUnmap(uintptr_t phys, size_t length);

+	bool isIommuEnabled() const
+	{ return this->hasIommu; }
+
+	const int& getFd() const
+	{ return fd; }
+
 private:
 	VfioGroup& getOrAttachGroup(int index);

@ -143,6 +150,7 @@ private:
 	int version;
 	int extensions;
 	uint64_t iova_next;			/**< Next free IOVA address */
+	bool hasIommu;

 	/// All groups bound to this container
 	std::list<std::unique_ptr<VfioGroup>> groups;
--- a/fpga/include/villas/memory.hpp
+++ b/fpga/include/villas/memory.hpp
@ -116,6 +116,8 @@ public:
 		free = [&](MemoryBlock* mem) {
 			logger->warn("no free callback defined for addr space {}, not freeing",
 			             mem->getAddrSpaceId());
+
+			removeMemoryBlock(*mem);
 		};
 	}

@ -128,6 +130,21 @@ public:
 	{
 		const size_t size = num * sizeof(T);
 		auto mem = allocateBlock(size);
+
+		// Check if the allocated memory is really accessible by writing to the
+		// allocated memory and reading back. Exponentially increase offset to
+		// speed up testing.
+		MemoryAccessor<volatile uint8_t> byteAccessor(*mem);
+		size_t idx = 0;
+		for(int i = 0; idx < mem->getSize(); i++, idx = (1 << i)) {
+			auto val = static_cast<uint8_t>(i);
+			byteAccessor[idx] = val;
+			if(byteAccessor[idx] != val) {
+				logger->error("Cannot access allocated memory");
+				throw std::bad_alloc();
+			}
+		}
+
 		return MemoryAccessor<T>(std::move(mem));
 	}

@ -179,6 +196,9 @@ public:
 	size_t getAvailableMemory() const
 	{ return memorySize - nextFreeAddress; }

+	size_t getSize() const
+	{ return memorySize; }
+
 	std::string getName() const;

 	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
@ -225,4 +245,41 @@ private:
 	static HostRamAllocator allocator;
 };

+
+class HostDmaRam {
+private:
+
+	static std::string
+	getUdmaBufName(int num);
+
+	static std::string
+	getUdmaBufBasePath(int num);
+
+	static size_t
+	getUdmaBufBufSize(int num);
+
+	static uintptr_t
+	getUdmaBufPhysAddr(int num);
+
+public:
+	class HostDmaRamAllocator : public LinearAllocator {
+	public:
+		HostDmaRamAllocator(int num);
+
+		virtual ~HostDmaRamAllocator();
+
+		std::string getName() const
+		{ return getUdmaBufName(num); }
+
+	private:
+		int num;
+	};
+
+	static HostDmaRamAllocator&
+	getAllocator(int num = 0);
+
+private:
+	static std::map<int, std::unique_ptr<HostDmaRamAllocator>> allocators;
+};
+
 } // namespace villas
--- a/fpga/include/villas/memory_manager.hpp
+++ b/fpga/include/villas/memory_manager.hpp
@ -3,6 +3,7 @@
 #include <cstdint>
 #include <string>
 #include <map>
+#include <stdexcept>
 #include <unistd.h>

 #include "log.hpp"
@ -73,7 +74,12 @@ private:
 	// This is a singleton, so private constructor ...
 	MemoryManager() :
 	    memoryGraph("MemoryGraph"),
-	    logger(loggerGetOrCreate("MemoryManager")) {}
+	    logger(loggerGetOrCreate("MemoryManager"))
+	{
+		pathCheckFunc = [&](const MemoryGraph::Path& path) {
+			return this->pathCheck(path);
+		};
+	}

 	// ... and no copying or assigning
 	MemoryManager(const MemoryManager&) = delete;
@ -109,7 +115,7 @@ private:
 			return stream << static_cast<const Edge&>(mapping) << " = "
 			              << mapping.name
 			              << std::hex
-			              << "(src=0x"   << mapping.src
+			              << " (src=0x"  << mapping.src
 			              << ", dest=0x" << mapping.dest
 			              << ", size=0x" << mapping.size
 			              << ")";
@ -144,6 +150,8 @@ public:
 	using AddressSpaceId = MemoryGraph::VertexIdentifier;
 	using MappingId = MemoryGraph::EdgeIdentifier;

+	struct InvalidTranslation : public std::exception {};
+
 	/// Get singleton instance
 	static MemoryManager&
 	get();
@ -152,6 +160,10 @@ public:
 	getProcessAddressSpace()
 	{ return getOrCreateAddressSpace("villas-fpga"); }

+	AddressSpaceId
+	getPciAddressSpace()
+	{ return getOrCreateAddressSpace("PCIe"); }
+
 	AddressSpaceId
 	getProcessAddressSpaceMemoryBlock(const std::string& memoryBlock)
 	{ return getOrCreateAddressSpace(getSlaveAddrSpaceName("villas-fpga", memoryBlock)); }
@ -184,6 +196,9 @@ public:
 	AddressSpaceId
 	findAddressSpace(const std::string& name);

+	std::list<AddressSpaceId>
+	findPath(AddressSpaceId fromAddrSpaceId, AddressSpaceId toAddrSpaceId);
+
 	MemoryTranslation
 	getTranslation(AddressSpaceId fromAddrSpaceId, AddressSpaceId toAddrSpaceId);

@ -203,6 +218,9 @@ public:
 	dump()
 	{ memoryGraph.dump(); }

+	void
+	dumpToFile(const std::string& fileName)
+	{ memoryGraph.dump(fileName); }

 private:
 	/// Convert a Mapping to MemoryTranslation for calculations
@ -210,6 +228,8 @@ private:
 	getTranslationFromMapping(const Mapping& mapping)
 	{ return MemoryTranslation(mapping.src, mapping.dest, mapping.size); }

+	bool
+	pathCheck(const MemoryGraph::Path& path);

 private:
 	/// Directed graph that stores address spaces and memory mappings
@ -221,6 +241,8 @@ private:
 	/// Logger for universal access in this class
 	SpdLogger logger;

+	MemoryGraph::check_path_fn pathCheckFunc;
+
 	/// Static pointer to global instance, because this is a singleton
 	static MemoryManager* instance;
 };
--- a/fpga/include/villas/plugin.hpp
+++ b/fpga/include/villas/plugin.hpp
@ -40,6 +40,7 @@ public:
 		Unknown,
 		FpgaIp,
 		FpgaCard,
+		Gpu
 	};

 	Plugin(Type type, const std::string& name);
--- a/fpga/include/villas/utils.hpp
+++ b/fpga/include/villas/utils.hpp
@ -10,6 +10,15 @@ namespace utils {
 std::vector<std::string>
 tokenize(std::string s, std::string delimiter);

+
+template<typename T>
+void
+assertExcept(bool condition, const T& exception)
+{
+	if(not condition)
+		throw exception;
+}
+
 } // namespace utils
 } // namespace villas

--- a/fpga/lib/CMakeLists.txt
+++ b/fpga/lib/CMakeLists.txt
@ -36,6 +36,16 @@ add_library(villas-fpga SHARED ${SOURCES})

 target_link_libraries(villas-fpga PUBLIC villas-common)

+# GPU library is optional, check for CUDA presence
+include(CheckLanguage)
+check_language(CUDA)
+if(CMAKE_CUDA_COMPILER)
+	add_subdirectory(gpu)
+	target_link_libraries(villas-fpga PUBLIC villas-gpu)
+else()
+	message("No CUDA support, not building GPU library")
+endif()
+
 target_compile_definitions(villas-fpga PRIVATE
 	BUILDID=\"abc\"
 	_GNU_SOURCE
--- a/fpga/lib/card.cpp
+++ b/fpga/lib/card.cpp
@ -165,6 +165,11 @@ PCIeCard::lookupIp(const Vlnv& vlnv) const
 bool
 PCIeCard::mapMemoryBlock(const MemoryBlock& block)
 {
+	if(not vfioContainer->isIommuEnabled()) {
+		logger->warn("VFIO mapping not supported without IOMMU");
+		return false;
+	}
+
 	auto& mm = MemoryManager::get();
 	const auto& addrSpaceId = block.getAddrSpaceId();

@ -175,7 +180,6 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
 		logger->debug("Create VFIO mapping for {}", addrSpaceId);
 	}

-
 	auto translationFromProcess = mm.getTranslationFromProcess(addrSpaceId);
 	uintptr_t processBaseAddr = translationFromProcess.getLocalAddr(0);
 	uintptr_t iovaAddr = vfioContainer->memoryMap(processBaseAddr,
@ -188,10 +192,8 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
 		return false;
 	}

-
-
 	mm.createMapping(iovaAddr, 0, block.getSize(),
-	                 "vfio",
+	                 "VFIO-D2H",
 	                 this->addrSpaceIdDeviceToHost,
 	                 addrSpaceId);

@ -203,11 +205,8 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)


 bool
-fpga::PCIeCard::init()
+PCIeCard::init()
 {
-	struct pci_device *pdev;
-
-	auto& mm = MemoryManager::get();
 	logger = getLogger();

 	logger->info("Initializing FPGA card {}", name);
@ -223,41 +222,12 @@ fpga::PCIeCard::init()
 	VfioDevice& device = vfioContainer->attachDevice(pdev);
 	this->vfioDevice = &device;

-
 	/* Enable memory access and PCI bus mastering for DMA */
 	if (not device.pciEnable()) {
 		logger->error("Failed to enable PCI device");
 		return false;
 	}

-	/* Map PCIe BAR */
-	const void* bar0_mapped = vfioDevice->regionMap(VFIO_PCI_BAR0_REGION_INDEX);
-	if (bar0_mapped == MAP_FAILED) {
-		logger->error("Failed to mmap() BAR0");
-		return false;
-	}
-
-	// determine size of BAR0 region
-	const size_t bar0_size = vfioDevice->regionGetSize(VFIO_PCI_BAR0_REGION_INDEX);
-
-
-	/* Link mapped BAR0 to global memory graph */
-
-	// get the address space of the current application
-	const auto villasAddrSpace = mm.getProcessAddressSpace();
-
-	// get the address space for the PCIe proxy we use with VFIO
-	const auto cardPCIeAddrSpaceName = mm.getMasterAddrSpaceName(name, "PCIe");
-
-	// create a new address space for this FPGA card
-	addrSpaceIdHostToDevice = mm.getOrCreateAddressSpace(cardPCIeAddrSpaceName);
-
-	// create a mapping from our address space to the FPGA card via vfio
-	mm.createMapping(reinterpret_cast<uintptr_t>(bar0_mapped),
-	                                   0, bar0_size, "VFIO_map",
-	                                   villasAddrSpace, addrSpaceIdHostToDevice);
-
-
 	/* Reset system? */
 	if (do_reset) {
 		/* Reset / detect PCI device */
--- a/fpga/lib/common/memory.cpp
+++ b/fpga/lib/common/memory.cpp
@ -1,6 +1,9 @@
 #include <sys/mman.h>
 #include <unistd.h>

+#include <sstream>
+#include <fstream>
+
 #include "memory.hpp"

 namespace villas {
@ -62,6 +65,8 @@ LinearAllocator::LinearAllocator(MemoryManager::AddressSpaceId memoryAddrSpaceId
 		              mem->getSize(), mem->getOffset(), mem->getAddrSpaceId());
 		logger->warn("free() not implemented");
 		logger->debug("available memory: {:#x} bytes", getAvailableMemory());
+
+		removeMemoryBlock(*mem);
 	};
 }

@ -133,7 +138,127 @@ HostRam::HostRamAllocator::HostRamAllocator() :
 			logger->warn("munmap() failed for {:#x} of size {:#x}",
 			             mem->getOffset(), mem->getSize());
 		}
+
+		removeMemoryBlock(*mem);
 	};
 }

+
+std::map<int, std::unique_ptr<HostDmaRam::HostDmaRamAllocator>>
+HostDmaRam::allocators;
+
+HostDmaRam::HostDmaRamAllocator::HostDmaRamAllocator(int num) :
+    LinearAllocator(MemoryManager::get().getOrCreateAddressSpace(getUdmaBufName(num)), getUdmaBufBufSize(num)),
+    num(num)
+{
+	auto& mm = MemoryManager::get();
+	logger = loggerGetOrCreate(getName());
+
+	if(getSize() == 0) {
+		logger->error("Zero-sized DMA buffer not supported, is the kernel module loaded?");
+		throw std::bad_alloc();
+	}
+
+	const uintptr_t base = getUdmaBufPhysAddr(num);
+
+	mm.createMapping(base, 0, getSize(), getName() + "-PCI",
+	                 mm.getPciAddressSpace(), getAddrSpaceId());
+
+	const auto bufPath = std::string("/dev/") + getUdmaBufName(num);
+	const int bufFd = open(bufPath.c_str(), O_RDWR | O_SYNC);
+	if(bufFd != -1) {
+		void* buf = mmap(nullptr, getSize(), PROT_READ|PROT_WRITE, MAP_SHARED, bufFd, 0);
+		close(bufFd);
+
+		if(buf != MAP_FAILED) {
+			mm.createMapping(reinterpret_cast<uintptr_t>(buf), 0, getSize(),
+			                 getName() + "-VA",
+			                 mm.getProcessAddressSpace(), getAddrSpaceId());
+		} else {
+			logger->warn("Cannot map {}", bufPath);
+		}
+	} else {
+		logger->warn("Cannot open {}", bufPath);
+	}
+
+	logger->info("Mapped {} of size {} bytes", bufPath, getSize());
+}
+
+HostDmaRam::HostDmaRamAllocator::~HostDmaRamAllocator()
+{
+	auto& mm = MemoryManager::get();
+
+	void* baseVirt;
+	try {
+		auto translation = mm.getTranslationFromProcess(getAddrSpaceId());
+		baseVirt = reinterpret_cast<void*>(translation.getLocalAddr(0));
+	} catch(const std::out_of_range&) {
+		// not mapped, nothing to do
+		return;
+	}
+
+	logger->debug("Unmapping {}", getName());
+
+	// try to unmap it
+	if(::munmap(baseVirt, getSize()) != 0) {
+		logger->warn("munmap() failed for {:p} of size {:#x}",
+		             baseVirt, getSize());
+	}
+}
+
+std::string
+HostDmaRam::getUdmaBufName(int num)
+{
+	std::stringstream name;
+	name << "udmabuf" << num;
+
+	return name.str();
+}
+
+std::string
+HostDmaRam::getUdmaBufBasePath(int num)
+{
+	std::stringstream path;
+	path << "/sys/class/udmabuf/udmabuf" << num << "/";
+	return path.str();
+}
+
+size_t
+HostDmaRam::getUdmaBufBufSize(int num)
+{
+	std::fstream s(getUdmaBufBasePath(num) + "size", s.in);
+	if(s.is_open()) {
+		std::string line;
+		if(std::getline(s, line)) {
+			return std::strtoul(line.c_str(), nullptr, 10);
+		}
+	}
+
+	return 0;
+}
+
+uintptr_t
+HostDmaRam::getUdmaBufPhysAddr(int num)
+{
+	std::fstream s(getUdmaBufBasePath(num) + "phys_addr", s.in);
+	if(s.is_open()) {
+		std::string line;
+		if(std::getline(s, line)) {
+			return std::strtoul(line.c_str(), nullptr, 16);
+		}
+	}
+
+	return UINTPTR_MAX;
+}
+
+HostDmaRam::HostDmaRamAllocator&HostDmaRam::getAllocator(int num)
+{
+	auto& allocator = allocators[num];
+	if(not allocator) {
+		allocator = std::make_unique<HostDmaRamAllocator>(num);
+	}
+
+	return *allocator;
+}
+
 } // namespace villas
--- a/fpga/lib/common/memory_manager.cpp
+++ b/fpga/lib/common/memory_manager.cpp
@ -2,8 +2,11 @@
 #include <limits>
 #include <cstdint>

+#include <villas/utils.hpp>
 #include "memory_manager.hpp"

+using namespace villas::utils;
+
 namespace villas {

 MemoryManager*
@ -70,17 +73,44 @@ MemoryManager::findAddressSpace(const std::string& name)
 	});
 }

+std::list<MemoryManager::AddressSpaceId>
+MemoryManager::findPath(MemoryManager::AddressSpaceId fromAddrSpaceId,
+                        MemoryManager::AddressSpaceId toAddrSpaceId)
+{
+	std::list<AddressSpaceId> path;
+
+	auto fromAddrSpace = memoryGraph.getVertex(fromAddrSpaceId);
+	auto toAddrSpace = memoryGraph.getVertex(toAddrSpaceId);
+
+	// find a path through the memory graph
+	MemoryGraph::Path pathGraph;
+	if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, pathGraph, pathCheckFunc)) {
+
+		logger->debug("No translation found from ({}) to ({})",
+		              *fromAddrSpace, *toAddrSpace);
+
+		throw std::out_of_range("no translation found");
+	}
+
+	for(auto& mappingId : pathGraph) {
+		auto mapping = memoryGraph.getEdge(mappingId);
+		path.push_back(mapping->getVertexTo());
+	}
+
+	return path;
+}
+
 MemoryTranslation
 MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
                              MemoryManager::AddressSpaceId toAddrSpaceId)
 {
 	// find a path through the memory graph
 	MemoryGraph::Path path;
-	if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, path)) {
+	if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, path, pathCheckFunc)) {
+
 		auto fromAddrSpace = memoryGraph.getVertex(fromAddrSpaceId);
 		auto toAddrSpace = memoryGraph.getVertex(toAddrSpaceId);
-
-		logger->error("No translation found from ({}) to ({})",
+		logger->debug("No translation found from ({}) to ({})",
 		              *fromAddrSpace, *toAddrSpace);

 		throw std::out_of_range("no translation found");
@ -98,6 +128,26 @@ MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
 	return translation;
 }

+bool
+MemoryManager::pathCheck(const MemoryGraph::Path& path)
+{
+	// start with an identity mapping
+	MemoryTranslation translation(0, 0, SIZE_MAX);
+
+	// Try to add all mappings together to a common translation. If this fails
+	// there is a non-overlapping window
+	for(auto& mappingId : path) {
+		auto mapping = memoryGraph.getEdge(mappingId);
+		try {
+			translation += getTranslationFromMapping(*mapping);
+		} catch(const InvalidTranslation&) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
 uintptr_t
 MemoryTranslation::getLocalAddr(uintptr_t addrInForeignAddrSpace) const
 {
@ -124,9 +174,18 @@ MemoryTranslation::operator+=(const MemoryTranslation& other)
 	const uintptr_t this_dst_high = this->dst + this->size;
 	const uintptr_t other_src_high = other.src + other.size;

+	logger->debug("this->src:      {:#x}", this->src);
+	logger->debug("this->dst:      {:#x}", this->dst);
+	logger->debug("this->size:     {:#x}", this->size);
+	logger->debug("other.src:      {:#x}", other.src);
+	logger->debug("other.dst:      {:#x}", other.dst);
+	logger->debug("other.size:     {:#x}", other.size);
+	logger->debug("this_dst_high:  {:#x}", this_dst_high);
+	logger->debug("other_src_high: {:#x}", other_src_high);
+
 	// make sure there is a common memory area
-	assert(other.src < this_dst_high);
-	assert(this->dst < other_src_high);
+	assertExcept(other.src < this_dst_high, MemoryManager::InvalidTranslation());
+	assertExcept(this->dst < other_src_high, MemoryManager::InvalidTranslation());

 	const uintptr_t hi = std::max(this_dst_high, other_src_high);
 	const uintptr_t lo = std::min(this->dst, other.src);
@ -135,33 +194,39 @@ MemoryTranslation::operator+=(const MemoryTranslation& other)
 	                          ? (this_dst_high - other_src_high)
 	                          : (other_src_high - this_dst_high);

-	const uintptr_t diff_lo = (this->dst > other.src)
+	const bool otherSrcIsSmaller = this->dst > other.src;
+	const uintptr_t diff_lo = (otherSrcIsSmaller)
 	                          ? (this->dst - other.src)
 	                          : (other.src - this->dst);

-	const size_t size = (hi - lo) - diff_hi - diff_lo;
+	logger->debug("hi:             {:#x}", hi);
+	logger->debug("lo:             {:#x}", lo);
+	logger->debug("diff_hi:        {:#x}", diff_hi);
+	logger->debug("diff_lo:        {:#x}", diff_lo);

-	logger->debug("this->src:      0x{:x}", this->src);
-	logger->debug("this->dst:      0x{:x}", this->dst);
-	logger->debug("this->size:     0x{:x}", this->size);
-	logger->debug("other.src:      0x{:x}", other.src);
-	logger->debug("other.dst:      0x{:x}", other.dst);
-	logger->debug("other.size:     0x{:x}", other.size);
-	logger->debug("this_dst_high:  0x{:x}", this_dst_high);
-	logger->debug("other_src_high: 0x{:x}", other_src_high);
-	logger->debug("hi:             0x{:x}", hi);
-	logger->debug("lo:             0x{:x}", lo);
-	logger->debug("diff_hi:        0x{:x}", diff_hi);
-	logger->debug("diff_hi:        0x{:x}", diff_lo);
-	logger->debug("size:           0x{:x}", size);
+	// new size of aperture, can only stay or shrink
+	this->size = (hi - lo) - diff_hi - diff_lo;

-	this->src += other.src;
-	this->dst += other.dst;
-	this->size = size;
+	// new translation will come out other's destination (by default)
+	this->dst = other.dst;

-	logger->debug("result src:     0x{:x}", this->src);
-	logger->debug("result dst:     0x{:x}", this->dst);
-	logger->debug("result size:    0x{:x}", this->size);
+	// the source stays the same and can only increase with merged translations
+	this->src = this->src;
+
+	if(otherSrcIsSmaller) {
+		// other mapping starts at lower addresses, so we actually arrive at
+		// higher addresses
+		this->dst += diff_lo;
+	} else {
+		// other mapping starts at higher addresses than this, so we have to
+		// increase the start
+		// NOTE: for addresses equality, this just adds 0
+		this->src += diff_lo;
+	}
+
+	logger->debug("result src:     {:#x}", this->src);
+	logger->debug("result dst:     {:#x}", this->dst);
+	logger->debug("result size:    {:#x}", this->size);

 	return *this;
 }
--- a/fpga/lib/gpu/CMakeLists.txt
+++ b/fpga/lib/gpu/CMakeLists.txt
@ -0,0 +1,26 @@
+cmake_minimum_required(VERSION 3.8)
+
+project(villas-gpu
+	VERSION 1.0
+	DESCRIPTION "VILLASgpu"
+	LANGUAGES C CXX CUDA)
+
+add_subdirectory(gdrcopy)
+
+add_library(villas-gpu SHARED gpu.cpp kernels.cu)
+
+target_compile_options(villas-gpu PRIVATE -g)
+
+set_source_files_properties(gpu.cpp PROPERTIES
+	LANGUAGE CUDA)
+
+target_include_directories(villas-gpu PRIVATE /opt/cuda/include)
+
+target_link_libraries(villas-gpu
+	PRIVATE villas-common gdrapi cuda)
+
+target_include_directories(villas-gpu
+	PUBLIC
+	    ${CMAKE_CURRENT_LIST_DIR}/include
+	PRIVATE
+	    ${CMAKE_CURRENT_LIST_DIR})
--- a/fpga/lib/gpu/gdrcopy
+++ b/fpga/lib/gpu/gdrcopy
@ -0,0 +1 @@
+Subproject commit 0441daa447b80260c4e11096f03e88f7be08bfa2
--- a/fpga/lib/gpu/gpu.cpp
+++ b/fpga/lib/gpu/gpu.cpp
@ -0,0 +1,474 @@
+#include <cstdio>
+#include <cstdint>
+#include <sys/mman.h>
+
+#include <memory>
+#include <algorithm>
+
+#include <villas/gpu.hpp>
+#include <villas/log.hpp>
+#include <villas/kernel/pci.h>
+#include <memory_manager.hpp>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <gdrapi.h>
+
+#include "kernels.hpp"
+
+namespace villas {
+namespace gpu {
+
+static GpuFactory gpuFactory;
+
+GpuAllocator::GpuAllocator(Gpu& gpu) :
+    BaseAllocator(gpu.masterPciEAddrSpaceId),
+    gpu(gpu)
+{
+	free = [&](MemoryBlock* mem) {
+		cudaSetDevice(gpu.gpuId);
+		if(cudaFree(reinterpret_cast<void*>(mem->getOffset())) != cudaSuccess) {
+			logger->warn("cudaFree() failed for {:#x} of size {:#x}",
+			             mem->getOffset(), mem->getSize());
+		}
+
+		removeMemoryBlock(*mem);
+	};
+}
+
+std::string
+villas::gpu::GpuAllocator::getName() const
+{
+	std::stringstream name;
+	name << "GpuAlloc" << getAddrSpaceId();
+	return name.str();
+}
+
+
+GpuFactory::GpuFactory() :
+    Plugin(Plugin::Type::Gpu, "GPU")
+{
+	logger = loggerGetOrCreate("GpuFactory");
+}
+
+// required to be defined here for PIMPL to compile
+Gpu::~Gpu()
+{
+	auto& mm = MemoryManager::get();
+	mm.removeAddressSpace(masterPciEAddrSpaceId);
+}
+
+
+// we use PIMPL in order to hide gdrcopy types from the public header
+class Gpu::impl {
+public:
+	gdr_t gdr;
+	struct pci_device pdev;
+};
+
+std::string Gpu::getName() const
+{
+	cudaDeviceProp deviceProp;
+	if(cudaGetDeviceProperties(&deviceProp, gpuId) != cudaSuccess) {
+		// logger not yet availabe
+		loggerGetOrCreate("Gpu")->error("Cannot retrieve properties for GPU {}", gpuId);
+		throw std::exception();
+	}
+
+	std::stringstream name;
+	name << "gpu" << gpuId << "(" << deviceProp.name << ")";
+
+	return name.str();
+}
+
+bool Gpu::registerIoMemory(const MemoryBlock& mem)
+{
+	auto& mm = MemoryManager::get();
+	const auto pciAddrSpaceId = mm.getPciAddressSpace();
+
+	// Check if we need to map anything at all, maybe it's already reachable
+	try {
+		// TODO: there might already be a path through the graph, but there's no
+		// overlapping window, so this will fail badly!
+		auto translation = mm.getTranslation(masterPciEAddrSpaceId,
+		                                     mem.getAddrSpaceId());
+		if(translation.getSize() >= mem.getSize()) {
+			// there is already a sufficient path
+			logger->debug("Already mapped through another mapping");
+			return true;
+		} else {
+			logger->warn("There's already a mapping, but too small");
+		}
+	} catch(const std::out_of_range&) {
+		// not yet reachable, that's okay, proceed
+	}
+
+
+	// In order to register IO memory with CUDA, it has to be mapped to the VA
+	// space of the current process (requirement of CUDA API). Check this now.
+	MemoryManager::AddressSpaceId mappedBaseAddrSpaceId;
+	try {
+		auto path = mm.findPath(mm.getProcessAddressSpace(), mem.getAddrSpaceId());
+		// first node in path is the mapped memory space whose virtual address
+		// we need to hand to CUDA
+		mappedBaseAddrSpaceId = path.front();
+	} catch (const std::out_of_range&) {
+		logger->error("Memory not reachable from process, but required by CUDA");
+		return false;
+	}
+
+	// determine the base address of the mapped memory region needed by CUDA
+	const auto translationProcess = mm.getTranslationFromProcess(mappedBaseAddrSpaceId);
+	const uintptr_t baseAddrForProcess = translationProcess.getLocalAddr(0);
+
+
+	// Now check that the memory is also reachable via PCIe bus, otherwise GPU
+	// has no means to access it.
+	uintptr_t baseAddrOnPci;
+	size_t	sizeOnPci;
+	try {
+		auto translationPci = mm.getTranslation(pciAddrSpaceId,
+		                                        mappedBaseAddrSpaceId);
+		baseAddrOnPci = translationPci.getLocalAddr(0);
+		sizeOnPci = translationPci.getSize();
+	} catch(const std::out_of_range&) {
+		logger->error("Memory is not reachable via PCIe bus");
+		return false;
+	}
+
+	if(sizeOnPci < mem.getSize()) {
+		logger->warn("VA mapping of IO memory is too small: {:#x} instead of {:#x} bytes",
+		              sizeOnPci, mem.getSize());
+		logger->warn("If something later on fails or behaves strangely, this might be the cause!");
+	}
+
+
+	cudaSetDevice(gpuId);
+
+	auto baseAddrVA = reinterpret_cast<void*>(baseAddrForProcess);
+	if(cudaHostRegister(baseAddrVA, sizeOnPci, cudaHostRegisterIoMemory) != cudaSuccess) {
+		logger->error("Cannot register IO memory for block {}", mem.getAddrSpaceId());
+		return false;
+	}
+
+	void* devicePointer = nullptr;
+	if(cudaHostGetDevicePointer(&devicePointer, baseAddrVA, 0) != cudaSuccess) {
+		logger->error("Cannot retrieve device pointer for IO memory");
+		return false;
+	}
+
+	mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), baseAddrOnPci,
+	                 sizeOnPci, "CudaIoMem", masterPciEAddrSpaceId, pciAddrSpaceId);
+
+	return true;
+}
+
+bool
+Gpu::registerHostMemory(const MemoryBlock& mem)
+{
+	auto& mm = MemoryManager::get();
+
+	auto translation = mm.getTranslationFromProcess(mem.getAddrSpaceId());
+	auto localBase = reinterpret_cast<void*>(translation.getLocalAddr(0));
+
+	int ret = cudaHostRegister(localBase, mem.getSize(), 0);
+	if(ret != cudaSuccess) {
+		logger->error("Cannot register memory block {} addr={:p} size={:#x} to CUDA: ret={}",
+		              mem.getAddrSpaceId(), localBase, mem.getSize(), ret);
+		return false;
+	}
+
+	void* devicePointer = nullptr;
+	ret = cudaHostGetDevicePointer(&devicePointer, localBase, 0);
+	if(ret != cudaSuccess) {
+		logger->error("Cannot retrieve device pointer for IO memory: ret={}", ret);
+		return false;
+	}
+
+	mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), 0, mem.getSize(),
+	                 "CudaHostMem", masterPciEAddrSpaceId, mem.getAddrSpaceId());
+
+	return true;
+}
+
+bool Gpu::makeAccessibleToPCIeAndVA(const MemoryBlock& mem)
+{
+	if(pImpl->gdr == nullptr) {
+		logger->warn("GDRcopy not available");
+		return false;
+	}
+
+	auto& mm = MemoryManager::get();
+
+	try {
+		auto path = mm.findPath(masterPciEAddrSpaceId, mem.getAddrSpaceId());
+		// if first hop is the PCIe bus, we know that memory is off-GPU
+		if(path.front() == mm.getPciAddressSpace()) {
+			throw std::out_of_range("Memory block is outside of this GPU");
+		}
+
+	} catch (const std::out_of_range&) {
+		logger->error("Trying to map non-GPU memory block");
+		return false;
+	}
+
+	logger->debug("retrieve complete device pointer from point of view of GPU");
+	// retrieve complete device pointer from point of view of GPU
+	auto translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                     mem.getAddrSpaceId());
+	CUdeviceptr devptr = translation.getLocalAddr(0);
+
+	int ret;
+
+	// required to set this flag before mapping
+	unsigned int enable = 1;
+	ret = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, devptr);
+	if(ret != CUDA_SUCCESS) {
+		logger->error("Cannot set pointer attributes on memory block {}: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	gdr_mh_t mh;
+	ret = gdr_pin_buffer(pImpl->gdr, devptr, mem.getSize(), 0, 0, &mh);
+	if(ret != 0) {
+		logger->error("Cannot pin memory block {} via gdrcopy: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	void* bar = nullptr;
+	ret = gdr_map(pImpl->gdr, mh, &bar, mem.getSize());
+	if(ret != 0) {
+		logger->error("Cannot map memory block {} via gdrcopy: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	gdr_info_t info;
+	ret = gdr_get_info(pImpl->gdr, mh, &info);
+	if(ret != 0) {
+		logger->error("Cannot get info for mapping of memory block {}: {}",
+		              mem.getAddrSpaceId(), ret);
+		return false;
+	}
+
+	const uintptr_t offset = info.va - devptr;
+	const uintptr_t userPtr = reinterpret_cast<uintptr_t>(bar) + offset;
+
+	logger->debug("BAR ptr:          {:p}",  bar);
+	logger->debug("info.va:          {:#x}", info.va);
+	logger->debug("info.mapped_size: {:#x}", info.mapped_size);
+	logger->debug("info.page_size:   {:#x}", info.page_size);
+	logger->debug("offset:           {:#x}", offset);
+	logger->debug("user pointer:     {:#x}", userPtr);
+
+	// mapping to acceses memory block from process
+	mm.createMapping(userPtr, 0, info.mapped_size, "GDRcopy",
+	                 mm.getProcessAddressSpace(), mem.getAddrSpaceId());
+
+	// retrieve bus address
+	uint64_t addr[8];
+	ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, 8);
+
+	for(int i = 0; i < ret; i++) {
+		logger->debug("DMA addr[{}]:      {:#x}", i, addr[i]);
+	}
+
+	if(ret != 1) {
+		logger->error("Only one DMA address per block supported at the moment");
+		return false;
+	}
+
+	// mapping to access memory block from peer devices via PCIe
+	mm.createMapping(addr[0], 0, mem.getSize(), "GDRcopyDMA",
+	                 mm.getPciAddressSpace(), mem.getAddrSpaceId());
+
+	return true;
+}
+
+bool
+Gpu::makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem)
+{
+	// Check which kind of memory this is and where it resides
+	// There are two possibilities:
+	//  - Host memory not managed by CUDA
+	//  - IO memory somewhere on the PCIe bus
+
+	auto& mm = MemoryManager::get();
+
+	bool isIoMemory = false;
+	try {
+		auto path = mm.findPath(mm.getPciAddressSpace(), mem.getAddrSpaceId());
+		isIoMemory = true;
+	} catch(const std::out_of_range&) {
+		// not reachable via PCI -> not IO memory
+	}
+
+	if(isIoMemory) {
+		logger->debug("Memory block {} is assumed to be IO memory",
+		              mem.getAddrSpaceId());
+
+		return registerIoMemory(mem);
+	} else {
+		logger->debug("Memory block {} is assumed to be non-CUDA host memory",
+		              mem.getAddrSpaceId());
+
+		return registerHostMemory(mem);
+	}
+}
+
+void Gpu::memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
+{
+	auto& mm = MemoryManager::get();
+
+	auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         src.getAddrSpaceId());
+	const void* src_buf = reinterpret_cast<void*>(src_translation.getLocalAddr(0));
+
+	auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         dst.getAddrSpaceId());
+	void* dst_buf = reinterpret_cast<void*>(dst_translation.getLocalAddr(0));
+
+	cudaSetDevice(gpuId);
+	cudaMemcpy(dst_buf, src_buf, size, cudaMemcpyDefault);
+}
+
+void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
+{
+	auto& mm = MemoryManager::get();
+
+	auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         src.getAddrSpaceId());
+	auto src_buf = reinterpret_cast<uint8_t*>(src_translation.getLocalAddr(0));
+
+	auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
+	                                         dst.getAddrSpaceId());
+	auto dst_buf = reinterpret_cast<uint8_t*>(dst_translation.getLocalAddr(0));
+
+	cudaSetDevice(gpuId);
+	kernel_memcpy<<<1, 1>>>(dst_buf, src_buf, size);
+	cudaDeviceSynchronize();
+}
+
+
+std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
+GpuAllocator::allocateBlock(size_t size)
+{
+	cudaSetDevice(gpu.gpuId);
+
+	void* addr;
+	if(cudaSuccess != cudaMalloc(&addr, size)) {
+		logger->error("cudaMalloc(..., size={}) failed", size);
+		throw std::bad_alloc();
+	}
+
+	auto& mm = MemoryManager::get();
+
+	// assemble name for this block
+	std::stringstream name;
+	name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
+
+	auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
+	auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
+
+	const auto localAddr = reinterpret_cast<uintptr_t>(addr);
+	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
+	        mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
+
+	insertMemoryBlock(*mem);
+
+	gpu.makeAccessibleToPCIeAndVA(*mem);
+
+	return mem;
+}
+
+
+Gpu::Gpu(int gpuId) :
+    pImpl{std::make_unique<impl>()},
+    gpuId(gpuId)
+{
+	logger = loggerGetOrCreate(getName());
+
+	pImpl->gdr = gdr_open();
+	if(pImpl->gdr == nullptr) {
+		logger->warn("No GDRcopy support enabled, cannot open /dev/gdrdrv");
+	}
+}
+
+bool Gpu::init()
+{
+	auto& mm = MemoryManager::get();
+
+	const auto gpuPciEAddrSpaceName = mm.getMasterAddrSpaceName(getName(), "PCIe");
+	masterPciEAddrSpaceId = mm.getOrCreateAddressSpace(gpuPciEAddrSpaceName);
+
+	allocator = std::make_unique<GpuAllocator>(*this);
+
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, gpuId);
+
+	pImpl->pdev.slot = {
+	    deviceProp.pciDomainID,
+	    deviceProp.pciBusID,
+	    deviceProp.pciDeviceID,
+	    0};
+
+	struct pci_region* pci_regions = nullptr;
+	const size_t pci_num_regions = pci_get_regions(&pImpl->pdev, &pci_regions);
+	for(size_t i = 0; i < pci_num_regions; i++) {
+		const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
+		logger->info("BAR{}: bus addr={:#x} size={:#x}",
+		             pci_regions[i].num, pci_regions[i].start, region_size);
+
+		char name[] = "BARx";
+		name[3] = '0' + pci_regions[i].num;
+
+		auto gpuBarXAddrSpaceName = mm.getSlaveAddrSpaceName(getName(), name);
+		auto gpuBarXAddrSpaceId = mm.getOrCreateAddressSpace(gpuBarXAddrSpaceName);
+
+		mm.createMapping(pci_regions[i].start, 0, region_size,
+		                 std::string("PCI-") + name,
+		                 mm.getPciAddressSpace(), gpuBarXAddrSpaceId);
+	}
+
+	free(pci_regions);
+
+	return true;
+}
+
+
+std::list<std::unique_ptr<Gpu>>
+GpuFactory::make()
+{
+	int deviceCount = 0;
+	cudaGetDeviceCount(&deviceCount);
+
+	std::list<std::unique_ptr<Gpu>> gpuList;
+
+	for(int gpuId = 0; gpuId < deviceCount; gpuId++) {
+		if(cudaSetDevice(gpuId) != cudaSuccess) {
+			logger->warn("Cannot activate GPU {}", gpuId);
+			continue;
+		}
+
+		auto gpu = std::make_unique<Gpu>(gpuId);
+
+		if(not gpu->init()) {
+			logger->warn("Cannot initialize GPU {}", gpuId);
+			continue;
+		}
+
+		gpuList.emplace_back(std::move(gpu));
+	}
+
+	logger->info("Initialized {} GPUs", gpuList.size());
+	for(auto& gpu : gpuList) {
+		logger->debug(" - {}", gpu->getName());
+	}
+
+	return gpuList;
+}
+
+} // namespace villas
+} // namespace gpu
--- a/fpga/lib/gpu/include/villas/gpu.hpp
+++ b/fpga/lib/gpu/include/villas/gpu.hpp
@ -0,0 +1,87 @@
+#pragma once
+
+#include <sstream>
+
+#include <plugin.hpp>
+#include <memory_manager.hpp>
+#include <memory.hpp>
+#include <villas/log.hpp>
+
+
+namespace villas {
+namespace gpu {
+
+class GpuAllocator;
+
+class Gpu {
+	friend GpuAllocator;
+public:
+	Gpu(int gpuId);
+	~Gpu();
+
+	bool init();
+
+	std::string getName() const;
+
+	GpuAllocator& getAllocator() const
+	{ return *allocator; }
+
+
+	bool makeAccessibleToPCIeAndVA(const MemoryBlock& mem);
+
+	/// Make some memory block accssible for this GPU
+	bool makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem);
+
+	void memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
+
+	void memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
+
+private:
+	bool registerIoMemory(const MemoryBlock& mem);
+	bool registerHostMemory(const MemoryBlock& mem);
+
+private:
+	class impl;
+	std::unique_ptr<impl> pImpl;
+
+	// master, will be used to derived slave addr spaces for allocation
+	MemoryManager::AddressSpaceId masterPciEAddrSpaceId;
+
+	MemoryManager::AddressSpaceId slaveMemoryAddrSpaceId;
+
+	SpdLogger logger;
+
+	int gpuId;
+
+	std::unique_ptr<GpuAllocator> allocator;
+};
+
+
+class GpuAllocator : public BaseAllocator<GpuAllocator> {
+public:
+	GpuAllocator(Gpu& gpu);
+
+	std::string getName() const;
+
+	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
+	allocateBlock(size_t size);
+
+private:
+	Gpu& gpu;
+};
+
+class GpuFactory : public Plugin {
+public:
+	GpuFactory();
+
+	std::list<std::unique_ptr<Gpu>>
+	make();
+
+	void run(void*);
+
+private:
+	SpdLogger logger;
+};
+
+} // namespace villas
+} // namespace gpu
--- a/fpga/lib/gpu/kernels.cu
+++ b/fpga/lib/gpu/kernels.cu
@ -0,0 +1,42 @@
+#include <stdio.h>
+
+#include <villas/gpu.hpp>
+#include "kernels.hpp"
+
+#include "cuda_runtime.h"
+#include <cuda.h>
+
+namespace villas {
+namespace gpu {
+
+
+__global__ void
+kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter)
+{
+	printf("[gpu] hello!\n");
+	printf("[gpu] mailbox: %p\n", mailbox);
+
+	printf("[kernel] started\n");
+
+	while(1) {
+		if (*mailbox == 1) {
+			*mailbox = 0;
+			printf("[gpu] counter = %d\n", *counter);
+			break;
+		}
+	}
+
+	printf("[gpu] quit\n");
+}
+
+__global__ void
+kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length)
+{
+	while(length > 0) {
+		*dst++ = *src++;
+		length--;
+	}
+}
+
+} // namespace villas
+} // namespace gpu
--- a/fpga/lib/gpu/kernels.hpp
+++ b/fpga/lib/gpu/kernels.hpp
@ -0,0 +1,16 @@
+#pragma once
+
+#include <cstdint>
+#include <cuda_runtime.h>
+
+namespace villas {
+namespace gpu {
+
+__global__ void
+kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter);
+
+__global__ void
+kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length);
+
+} // namespace villas
+} // namespace gpu
--- a/fpga/lib/ip.cpp
+++ b/fpga/lib/ip.cpp
@ -223,8 +223,8 @@ IpCoreFactory::make(PCIeCard* card, json_t *json_ips)
 					json_t* json_block;
 					json_object_foreach(json_instance, block_name, json_block) {

-						unsigned int base, high, size;
-						int ret = json_unpack(json_block, "{ s: i, s: i, s: i }",
+						json_int_t base, high, size;
+						int ret = json_unpack(json_block, "{ s: I, s: I, s: I }",
 						                      "baseaddr", &base,
 						                      "highaddr", &high,
 						                      "size", &size);
--- a/fpga/lib/ips/dma.cpp
+++ b/fpga/lib/ips/dma.cpp
@ -132,8 +132,14 @@ Dma::reset()


 bool
-Dma::pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
+Dma::memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
 {
+	if(len == 0)
+		return true;
+
+	if(not connectLoopback())
+		return false;
+
 	if(this->read(dst, len) == 0)
 		return false;

@ -153,17 +159,14 @@ Dma::pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
 size_t
 Dma::write(const MemoryBlock& mem, size_t len)
 {
-	// make sure memory is reachable
-	if(not card->mapMemoryBlock(mem)) {
-		logger->error("Memory not accessible by DMA");
-		return 0;
-	}
-
 	auto& mm = MemoryManager::get();
+
+	// user has to make sure that memory is accessible, otherwise this will throw
 	auto translation = mm.getTranslation(busMasterInterfaces[mm2sInterface],
 	                                     mem.getAddrSpaceId());
 	const void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));

+	logger->debug("Write to address: {:p}", buf);
 	return hasScatterGather() ? writeSG(buf, len) : writeSimple(buf, len);
 }

@ -171,17 +174,14 @@ Dma::write(const MemoryBlock& mem, size_t len)
 size_t
 Dma::read(const MemoryBlock& mem, size_t len)
 {
-	// make sure memory is reachable
-	if(not card->mapMemoryBlock(mem)) {
-		logger->error("Memory not accessible by DMA");
-		return 0;
-	}
-
 	auto& mm = MemoryManager::get();
+
+	// user has to make sure that memory is accessible, otherwise this will throw
 	auto translation = mm.getTranslation(busMasterInterfaces[s2mmInterface],
 	                                     mem.getAddrSpaceId());
 	void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));

+	logger->debug("Read from address: {:p}", buf);
 	return hasScatterGather() ? readSG(buf, len) : readSimple(buf, len);
 }

@ -350,6 +350,48 @@ Dma::readCompleteSimple()
 }


+bool
+Dma::makeAccesibleFromVA(const MemoryBlock& mem)
+{
+	// only symmetric mapping supported currently
+	if(isMemoryBlockAccesible(mem, s2mmInterface) and
+	   isMemoryBlockAccesible(mem, mm2sInterface)) {
+		return true;
+	}
+
+	// try mapping via FPGA-card (VFIO)
+	if(not card->mapMemoryBlock(mem)) {
+		logger->error("Memory not accessible by DMA");
+		return false;
+	}
+
+	// sanity-check if mapping worked, this shouldn't be neccessary
+	if(not isMemoryBlockAccesible(mem, s2mmInterface) or
+	   not isMemoryBlockAccesible(mem, mm2sInterface)) {
+		logger->error("Mapping memory via card didn't work, but reported success?!");
+		return false;
+	}
+
+	return true;
+}
+
+
+bool
+Dma::isMemoryBlockAccesible(const MemoryBlock& mem, const std::string& interface)
+{
+	auto& mm = MemoryManager::get();
+
+	try {
+		mm.findPath(getMasterAddrSpaceByInterface(interface), mem.getAddrSpaceId());
+	} catch(const std::out_of_range&) {
+		// not (yet) accessible
+		return false;
+	}
+
+	return true;
+}
+
+
 } // namespace ip
 } // namespace fpga
 } // namespace villas
--- a/fpga/lib/ips/pcie.cpp
+++ b/fpga/lib/ips/pcie.cpp
@ -42,14 +42,23 @@ AxiPciExpressBridge::init()

 	// Throw an exception if the is no bus master interface and thus no
 	// address space we can use for translation -> error
-	const MemoryManager::AddressSpaceId myAddrSpaceid =
-	        busMasterInterfaces.at(axiInterface);
+	card->addrSpaceIdHostToDevice = busMasterInterfaces.at(axiInterface);

-	// Create an identity mapping from the FPGA card to this IP as an entry
-	// point to all other IPs in the FPGA, because Vivado will generate a
-	// memory view for this bridge that can see all others.
-	MemoryManager::get().createMapping(0x00, 0x00, SIZE_MAX, "PCIeBridge",
-	                                   card->addrSpaceIdHostToDevice, myAddrSpaceid);
+	/* Map PCIe BAR0 via VFIO */
+	const void* bar0_mapped = card->vfioDevice->regionMap(VFIO_PCI_BAR0_REGION_INDEX);
+	if (bar0_mapped == MAP_FAILED) {
+		logger->error("Failed to mmap() BAR0");
+		return false;
+	}
+
+	// determine size of BAR0 region
+	const size_t bar0_size = card->vfioDevice->regionGetSize(VFIO_PCI_BAR0_REGION_INDEX);
+
+	// create a mapping from process address space to the FPGA card via vfio
+	mm.createMapping(reinterpret_cast<uintptr_t>(bar0_mapped),
+	                 0, bar0_size, "VFIO-H2D",
+	                 mm.getProcessAddressSpace(),
+	                 card->addrSpaceIdHostToDevice);


 	/* Make PCIe (IOVA) address space available to FPGA via BAR0 */
@ -63,6 +72,102 @@ AxiPciExpressBridge::init()
 	card->addrSpaceIdDeviceToHost =
 	        mm.getOrCreateAddressSpace(addrSpaceNameDeviceToHost);

+	auto pciAddrSpaceId = mm.getPciAddressSpace();
+
+	struct pci_region* pci_regions = nullptr;
+	size_t num_regions = pci_get_regions(card->pdev, &pci_regions);
+
+	for(size_t i = 0; i < num_regions; i++) {
+		const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
+
+		char barName[] = "BARx";
+		barName[3] = '0' + pci_regions[i].num;
+		auto pciBar = pcieToAxiTranslations.at(barName);
+
+
+		logger->info("PCI-BAR{}: bus addr={:#x} size={:#x}",
+		             pci_regions[i].num, pci_regions[i].start, region_size);
+		logger->info("PCI-BAR{}: AXI translation offset {:#x}",
+		             i, pciBar.translation);
+
+		mm.createMapping(pci_regions[i].start, pciBar.translation, region_size,
+		                 std::string("PCI-") + barName,
+		                 pciAddrSpaceId, card->addrSpaceIdHostToDevice);
+
+	}
+
+	if(pci_regions != nullptr) {
+		logger->debug("freeing pci regions");
+		free(pci_regions);
+	}
+
+
+	for(auto& [barName, axiBar] : axiToPcieTranslations) {
+		logger->info("AXI-{}: bus addr={:#x} size={:#x}",
+		             barName, axiBar.base, axiBar.size);
+		logger->info("AXI-{}: PCI translation offset: {:#x}",
+		             barName, axiBar.translation);
+
+		auto barXAddrSpaceName = mm.getSlaveAddrSpaceName(getInstanceName(), barName);
+		auto barXAddrSpaceId = mm.getOrCreateAddressSpace(barXAddrSpaceName);
+
+		// base is already incorporated into mapping of each IP by Vivado, so
+		// the mapping src has to be 0
+		mm.createMapping(0, axiBar.translation, axiBar.size,
+		                 std::string("AXI-") + barName,
+		                 barXAddrSpaceId, pciAddrSpaceId);
+	}
+
+	return true;
+}
+
+bool
+AxiPciExpressBridgeFactory::configureJson(IpCore& ip, json_t* json_ip)
+{
+	auto logger = getLogger();
+	auto& pcie = reinterpret_cast<AxiPciExpressBridge&>(ip);
+
+	for(auto barType : std::list<std::string>{"axi_bars", "pcie_bars"}) {
+		json_t* json_bars = json_object_get(json_ip, barType.c_str());
+		if(not json_is_object(json_bars)) {
+			return false;
+		}
+
+		json_t* json_bar;
+		const char* bar_name;
+		json_object_foreach(json_bars, bar_name, json_bar) {
+			unsigned int translation;
+			int ret = json_unpack(json_bar, "{ s: i }", "translation", &translation);
+			if(ret != 0) {
+				logger->error("Cannot parse {}/{}", barType, bar_name);
+				return false;
+			}
+
+			if(barType == "axi_bars") {
+				json_int_t base, high, size;
+				int ret = json_unpack(json_bar, "{ s: I, s: I, s: I }",
+				                      "baseaddr", &base,
+				                      "highaddr", &high,
+				                      "size", &size);
+				if(ret != 0) {
+					logger->error("Cannot parse {}/{}", barType, bar_name);
+					return false;
+				}
+
+				pcie.axiToPcieTranslations[bar_name] = {
+				    .base = static_cast<uintptr_t>(base),
+				    .size = static_cast<size_t>(size),
+				    .translation = translation
+				};
+
+			} else {
+				pcie.pcieToAxiTranslations[bar_name] = {
+				    .translation = translation
+				};
+			}
+		}
+	}
+
 	return true;
 }

--- a/fpga/lib/kernel/pci.c
+++ b/fpga/lib/kernel/pci.c
@ -254,10 +254,78 @@ struct pci_device * pci_lookup_device(struct pci *p, struct pci_device *f)
 	return list_search(&p->devices, (cmp_cb_t) pci_device_compare, (void *) f);
 }

+size_t pci_get_regions(const struct pci_device *d, struct pci_region** regions)
+{
+	FILE* f;
+	char sysfs[1024];
+
+	assert(regions != NULL);
+
+	snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/resource",
+	         SYSFS_PATH, d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
+
+	f = fopen(sysfs, "r");
+	if (!f)
+		serror("Failed to open resource mapping %s", sysfs);
+
+	struct pci_region _regions[8];
+	struct pci_region* cur_region = _regions;
+	size_t valid_regions = 0;
+
+	ssize_t bytesRead;
+	char* line = NULL;
+	size_t len = 0;
+
+	int region = 0;
+	// cap to 8 regions, just because we don't know how many may exist
+	while(region < 8 && (bytesRead = getline(&line, &len, f)) != -1) {
+		unsigned long long tokens[3];
+		char* s = line;
+		for(int i = 0; i < 3; i++) {
+			char* end;
+			tokens[i] = strtoull(s, &end, 16);
+			if(s == end) {
+				printf("Error parsing line %d of %s\n", region + 1, sysfs);
+				tokens[0] = tokens[1] = 0; // mark invalid
+				break;
+			}
+			s = end;
+		}
+
+		free(line);
+
+		// required for getline() to allocate a new buffer on the next iteration
+		line = NULL;
+		len = 0;
+
+		if(tokens[0] != tokens[1]) {
+			// this is a valid region
+			cur_region->num = region;
+			cur_region->start = tokens[0];
+			cur_region->end = tokens[1];
+			cur_region->flags = tokens[2];
+			cur_region++;
+			valid_regions++;
+		}
+
+		region++;
+	}
+
+	if(valid_regions > 0) {
+		const size_t len = valid_regions * sizeof (struct pci_region);
+		*regions = malloc(len);
+		memcpy(*regions, _regions, len);
+	}
+
+	return valid_regions;
+}
+
+
 int pci_get_driver(const struct pci_device *d, char *buf, size_t buflen)
 {
 	int ret;
 	char sysfs[1024], syml[1024];
+	memset(syml, 0, sizeof(syml));

 	snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/driver", SYSFS_PATH,
 		d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
@ -305,6 +373,7 @@ int pci_get_iommu_group(const struct pci_device *d)
 {
 	int ret;
 	char *group, link[1024], sysfs[1024];
+	memset(link, 0, sizeof(link));

 	snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/iommu_group", SYSFS_PATH,
 		d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
--- a/fpga/lib/kernel/vfio.cpp
+++ b/fpga/lib/kernel/vfio.cpp
@ -87,7 +87,7 @@ VfioContainer::VfioContainer()
 	}

 	/* Open VFIO API */
-	fd = open(VFIO_DEV("vfio"), O_RDWR);
+	fd = open(VFIO_DEV, O_RDWR);
 	if (fd < 0) {
 		logger->error("Failed to open VFIO container");
 		throw std::exception();
@ -108,12 +108,25 @@ VfioContainer::VfioContainer()
 			logger->error("Failed to get VFIO extensions");
 			throw std::exception();
 		}
-		else if (ret > 0)
+		else if (ret > 0) {
 			extensions |= (1 << i);
+		}
 	}

-	logger->debug("Version: {:#x}", version);
+	hasIommu = false;
+
+	if(not (extensions & (1 << VFIO_NOIOMMU_IOMMU))) {
+		if(not (extensions & (1 << VFIO_TYPE1_IOMMU))) {
+			logger->error("No supported IOMMU extension found");
+			throw std::exception();
+		} else {
+			hasIommu = true;
+		}
+	}
+
+	logger->debug("Version:    {:#x}", version);
 	logger->debug("Extensions: {:#x}", extensions);
+	logger->debug("IOMMU:      {}", hasIommu ? "yes" : "no");
 }


@ -289,7 +302,7 @@ VfioContainer::attachDevice(const pci_device* pdev)
 	}

 	/* Get IOMMU group of device */
-	int index = pci_get_iommu_group(pdev);
+	int index = isIommuEnabled() ? pci_get_iommu_group(pdev) : 0;
 	if (index < 0) {
 		logger->error("Failed to get IOMMU group of device");
 		throw std::exception();
@ -319,6 +332,11 @@ VfioContainer::memoryMap(uintptr_t virt, uintptr_t phys, size_t length)
 {
 	int ret;

+	if(not hasIommu) {
+		logger->error("DMA mapping not supported without IOMMU");
+		return UINTPTR_MAX;
+	}
+
 	if (length & 0xFFF) {
 		length += 0x1000;
 		length &= ~0xFFF;
@ -363,6 +381,10 @@ VfioContainer::memoryUnmap(uintptr_t phys, size_t length)
 {
 	int ret;

+	if(not hasIommu) {
+		return true;
+	}
+
 	struct vfio_iommu_type1_dma_unmap dmaUnmap;
 	dmaUnmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
 	dmaUnmap.flags = 0;
@ -390,7 +412,7 @@ VfioContainer::getOrAttachGroup(int index)
 	}

 	// group not yet part of this container, so acquire ownership
-	auto group = VfioGroup::attach(fd, index);
+	auto group = VfioGroup::attach(*this, index);
 	if(not group) {
 		logger->error("Failed to attach to IOMMU group: {}", index);
 		throw std::exception();
@ -516,6 +538,7 @@ VfioDevice::pciHotReset()

 	const size_t reset_infolen = sizeof(struct vfio_pci_hot_reset_info) +
 	                             sizeof(struct vfio_pci_dependent_device) * 64;
+
 	auto reset_info = reinterpret_cast<struct vfio_pci_hot_reset_info*>
 	                    (calloc(1, reset_infolen));

@ -540,6 +563,8 @@ VfioDevice::pciHotReset()
 		}
 	}

+	free(reset_info);
+
 	const size_t resetlen = sizeof(struct vfio_pci_hot_reset) +
 	                        sizeof(int32_t) * 1;
 	auto reset = reinterpret_cast<struct vfio_pci_hot_reset*>
@ -549,10 +574,15 @@ VfioDevice::pciHotReset()
 	reset->count = 1;
 	reset->group_fds[0] = this->group.fd;

-	const bool success = ioctl(this->fd, VFIO_DEVICE_PCI_HOT_RESET, reset) == 0;
+	int ret = ioctl(this->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+	const bool success = (ret == 0);

 	free(reset);
-	free(reset_info);
+
+	if(not success and not group.container->isIommuEnabled()) {
+		logger->info("PCI hot reset failed, but this is expected without IOMMU");
+		return true;
+	}

 	return success;
 }
@ -713,13 +743,18 @@ VfioGroup::~VfioGroup()


 std::unique_ptr<VfioGroup>
-VfioGroup::attach(int containerFd, int groupIndex)
+VfioGroup::attach(VfioContainer& container, int groupIndex)
 {
 	std::unique_ptr<VfioGroup> group { new VfioGroup(groupIndex) };

+	group->container = &container;
+
 	/* Open group fd */
 	std::stringstream groupPath;
-	groupPath << VFIO_DEV("") << groupIndex;
+	groupPath << VFIO_PATH
+	          << (container.isIommuEnabled() ? "" : "noiommu-")
+	          << groupIndex;
+
 	group->fd = open(groupPath.str().c_str(), O_RDWR);
 	if (group->fd < 0) {
 		logger->error("Failed to open VFIO group {}", group->index);
@ -729,18 +764,20 @@ VfioGroup::attach(int containerFd, int groupIndex)
 	logger->debug("VFIO group {} (fd {}) has path {}",
 	              groupIndex, group->fd, groupPath.str());

-	int ret;
-
 	/* Claim group ownership */
-	ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &containerFd);
+	int ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container.getFd());
 	if (ret < 0) {
 		logger->error("Failed to attach VFIO group {} to container fd {} (error {})",
-		              group->index, containerFd, ret);
+		              group->index, container.getFd(), ret);
 		return nullptr;
 	}

 	/* Set IOMMU type */
-	ret = ioctl(containerFd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
+	int iommu_type = container.isIommuEnabled() ?
+	                     VFIO_TYPE1_IOMMU :
+	                     VFIO_NOIOMMU_IOMMU;
+
+	ret = ioctl(container.getFd(), VFIO_SET_IOMMU, iommu_type);
 	if (ret < 0) {
 		logger->error("Failed to set IOMMU type of container: {}", ret);
 		return nullptr;
--- a/fpga/scripts/hwdef-parse.py
+++ b/fpga/scripts/hwdef-parse.py
@ -255,4 +255,27 @@ for bram in brams:
 	if instance in ips:
 		ips[instance]['size'] = int(size)

+pcies = root.xpath('.//MODULE[@MODTYPE="axi_pcie"]')
+for pcie in pcies:
+	instance = pcie.get('INSTANCE')
+	axi_bars = ips[instance].setdefault('axi_bars', {})
+	pcie_bars = ips[instance].setdefault('pcie_bars', {})
+
+	for from_bar, to_bar, from_bars in (('AXIBAR', 'PCIEBAR', axi_bars), ('PCIEBAR', 'AXIBAR', pcie_bars)):
+		from_bar_num = int(pcie.find('.//PARAMETER[@NAME="C_{}_NUM"]'.format(from_bar)).get('VALUE'))
+
+		for i in range(0, from_bar_num):
+			from_bar_to_bar_offset = int(pcie.find('.//PARAMETER[@NAME="C_{}2{}_{}"]'.format(from_bar, to_bar, i)).get('VALUE'), 16)
+			from_bars['BAR{}'.format(i)] = { 'translation': from_bar_to_bar_offset }
+
+			if from_bar == 'AXIBAR':
+				axi_bar_lo = int(pcie.find('.//PARAMETER[@NAME="C_{}_{}"]'.format(from_bar, i)).get('VALUE'), 16)
+				axi_bar_hi = int(pcie.find('.//PARAMETER[@NAME="C_{}_HIGHADDR_{}"]'.format(from_bar, i)).get('VALUE'), 16)
+				axi_bar_size = axi_bar_hi - axi_bar_lo + 1
+
+				axi_bar = from_bars['BAR{}'.format(i)]
+				axi_bar['baseaddr'] = axi_bar_lo
+				axi_bar['highaddr'] = axi_bar_hi
+				axi_bar['size'] = axi_bar_size
+
 print(json.dumps(ips, indent=2))
--- a/fpga/tests/CMakeLists.txt
+++ b/fpga/tests/CMakeLists.txt
@ -12,6 +12,10 @@ set(SOURCES
    graph.cpp
 )

+if(CMAKE_CUDA_COMPILER)
+	list(APPEND SOURCES gpu.cpp)
+endif()
+
 add_executable(unit-tests ${SOURCES})

 find_package(Criterion REQUIRED)
--- a/fpga/tests/dma.cpp
+++ b/fpga/tests/dma.cpp
@ -26,38 +26,52 @@ Test(fpga, dma, .description = "DMA")

 		auto dma = reinterpret_cast<villas::fpga::ip::Dma&>(*ip);

-		if(not dma.connectLoopback()) {
+		if(not dma.loopbackPossible()) {
+			logger->info("Loopback test not possible for {}", *ip);
 			continue;
 		}

 		count++;

-		if(not dma.loopbackPossible()) {
-			logger->info("Loopback test not possible for {}", *ip);
-			continue;
-		}
+
+		// Simple DMA can only transfer up to 4 kb due to PCIe page size burst
+		// limitation
+		size_t len = 4 * (1 << 10);

 		// find a block RAM IP to write to
 		auto bramIp = state.cards.front()->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"));
 		auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(bramIp);
 		cr_assert_not_null(bram, "Couldn't find BRAM");

-		// Simple DMA can only transfer up to 4 kb due to PCIe page size burst
-		// limitation
-		size_t len = 4 * (1 << 10);

 		/* Allocate memory to use with DMA */
-		auto src = villas::HostRam::getAllocator().allocate<char>(len);
-		auto dst = bram->getAllocator().allocate<char>(len);
+		auto src = villas::HostDmaRam::getAllocator().allocate<char>(len);
+		auto dst = villas::HostDmaRam::getAllocator().allocate<char>(len);
+
+		/* ... only works with IOMMU enabled currently */
+//		auto src = bram->getAllocator().allocate<char>(len);
+//		auto dst = bram->getAllocator().allocate<char>(len);
+
+		/* ... only works with IOMMU enabled currently */
+//		auto src = villas::HostRam::getAllocator().allocate<char>(len);
+//		auto dst = villas::HostRam::getAllocator().allocate<char>(len);
+
+		/* Make sure memory is accessible for DMA */
+		cr_assert(dma.makeAccesibleFromVA(src.getMemoryBlock()),
+		          "Source memory not accessible for DMA");
+		cr_assert(dma.makeAccesibleFromVA(dst.getMemoryBlock()),
+		          "Destination memory not accessible for DMA");

 		/* Get new random data */
 		const size_t lenRandom = read_random(&src, len);
 		cr_assert(len == lenRandom, "Failed to get random data");

+
 		/* Start transfer */
-		cr_assert(dma.pingPong(src.getMemoryBlock(), dst.getMemoryBlock(), len),
+		cr_assert(dma.memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len),
 		          "DMA ping pong failed");

+
 		/* Compare data */
 		cr_assert(memcmp(&src, &dst, len) == 0, "Data not equal");

--- a/fpga/tests/gpu.cpp
+++ b/fpga/tests/gpu.cpp
@ -0,0 +1,129 @@
+#include <criterion/criterion.h>
+
+#include <map>
+#include <string>
+
+#include <villas/log.hpp>
+#include <villas/fpga/card.hpp>
+#include <villas/fpga/ips/dma.hpp>
+#include <villas/fpga/ips/bram.hpp>
+
+#include <villas/utils.h>
+
+#include "global.hpp"
+
+#include <villas/memory.hpp>
+#include <villas/gpu.hpp>
+
+
+Test(fpga, gpu_dma, .description = "GPU DMA tests")
+{
+	auto logger = loggerGetOrCreate("unittest:dma");
+
+	auto& card = state.cards.front();
+
+	villas::Plugin* plugin = villas::Plugin::lookup(villas::Plugin::Type::Gpu, "");
+	auto gpuPlugin = dynamic_cast<villas::gpu::GpuFactory*>(plugin);
+	cr_assert_not_null(gpuPlugin, "No GPU plugin found");
+
+	auto gpus = gpuPlugin->make();
+	cr_assert(gpus.size() > 0, "No GPUs found");
+
+	// just get first cpu
+	auto& gpu = gpus.front();
+
+	size_t count = 0;
+	for(auto& ip : card->ips) {
+		// skip non-dma IPs
+		if(*ip != villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"))
+			continue;
+
+		logger->info("Testing {}", *ip);
+
+		auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(ip.get());
+		cr_assert_not_null(bram, "Couldn't find BRAM");
+
+		count++;
+
+		size_t len = 4 * (1 << 10);
+
+		/* Allocate memory to use with DMA */
+
+		auto bram0 = bram->getAllocator().allocate<char>(len);
+		auto bram1 = bram->getAllocator().allocate<char>(len);
+
+		gpu->makeAccessibleFromPCIeOrHostRam(bram0.getMemoryBlock());
+		gpu->makeAccessibleFromPCIeOrHostRam(bram1.getMemoryBlock());
+
+		auto hostRam0 = villas::HostRam::getAllocator().allocate<char>(len);
+		auto hostRam1 = villas::HostRam::getAllocator().allocate<char>(len);
+
+		gpu->makeAccessibleFromPCIeOrHostRam(hostRam0.getMemoryBlock());
+		gpu->makeAccessibleFromPCIeOrHostRam(hostRam1.getMemoryBlock());
+
+		auto dmaRam0 = villas::HostDmaRam::getAllocator().allocate<char>(len);
+		auto dmaRam1 = villas::HostDmaRam::getAllocator().allocate<char>(len);
+
+		gpu->makeAccessibleFromPCIeOrHostRam(dmaRam0.getMemoryBlock());
+		gpu->makeAccessibleFromPCIeOrHostRam(dmaRam1.getMemoryBlock());
+
+		auto gpuMem0 = gpu->getAllocator().allocate<char>(64 << 10);
+		auto gpuMem1 = gpu->getAllocator().allocate<char>(64 << 10);
+
+		gpu->makeAccessibleToPCIeAndVA(gpuMem0.getMemoryBlock());
+		gpu->makeAccessibleToPCIeAndVA(gpuMem1.getMemoryBlock());
+
+
+//		auto& src = bram0;
+//		auto& dst = bram1;
+
+//		auto& src = hostRam0;
+//		auto& dst = hostRam1;
+
+		auto& src = dmaRam0;
+//		auto& dst = dmaRam1;
+
+//		auto& src = gpuMem0;
+		auto& dst = gpuMem1;
+
+
+		std::list<std::pair<std::string, std::function<void()>>> memcpyFuncs = {
+		    {"cudaMemcpy", [&]() {gpu->memcpySync(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
+		    {"CUDA kernel", [&]() {gpu->memcpyKernel(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
+	    };
+
+		auto dmaIp = card->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_dma:"));
+		auto dma = dynamic_cast<villas::fpga::ip::Dma*>(dmaIp);
+
+		if(dma != nullptr and dma->connectLoopback()) {
+			memcpyFuncs.push_back({
+			    "DMA memcpy", [&]() {
+			        if(not dma->makeAccesibleFromVA(src.getMemoryBlock()) or
+			           not dma->makeAccesibleFromVA(dst.getMemoryBlock())) {
+			            return;
+			        }
+			        dma->memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len);
+			    }});
+		}
+
+		for(auto& [name, memcpyFunc] : memcpyFuncs) {
+			logger->info("Testing {}", name);
+
+			/* Get new random data */
+			const size_t lenRandom = read_random(&src, len);
+			cr_assert(len == lenRandom, "Failed to get random data");
+
+			memcpyFunc();
+			const bool success = memcmp(&src, &dst, len) == 0;
+
+			logger->info("  {}", success ?
+			                 TXT_GREEN("Passed") :
+			                 TXT_RED("Failed"));
+		}
+
+		villas::MemoryManager::get().dump();
+	}
+
+
+	cr_assert(count > 0, "No BRAM found");
+}
--- a/fpga/thirdparty/udmabuf
+++ b/fpga/thirdparty/udmabuf
@ -0,0 +1 @@
+Subproject commit 65762ca3333cb230e3b80ecf4cb5e605390474f2
				`@ -0,0 +1 @@`
				`Subproject commit 0441daa447b80260c4e11096f03e88f7be08bfa2`
				`@ -0,0 +1 @@`
				`Subproject commit 65762ca3333cb230e3b80ecf4cb5e605390474f2`