diff --git a/fpga/common b/fpga/common
index dd7d75d0a..9747c6ead 160000
--- a/fpga/common
+++ b/fpga/common
@@ -1 +1 @@
-Subproject commit dd7d75d0aab3801d65f9ff757d82f47f705514af
+Subproject commit 9747c6ead6dedff943dbf22ce74e40e9b2622514
diff --git a/fpga/etc/fpga.json b/fpga/etc/fpga.json
index 5f982e2a8..2beee4407 100644
--- a/fpga/etc/fpga.json
+++ b/fpga/etc/fpga.json
@@ -33,90 +33,6 @@
         "hier_0_axi_dma_axi_dma_0": {
           "vlnv": "xilinx.com:ip:axi_dma:7.1",
           "memory-view": {
-            "M_AXI_SG": {
-              "bram_0_axi_bram_ctrl_0": {
-                "Mem0": {
-                  "baseaddr": 0,
-                  "highaddr": 8191,
-                  "size": 8192
-                }
-              },
-              "hier_0_axi_dma_axi_dma_1": {
-                "Reg": {
-                  "baseaddr": 8192,
-                  "highaddr": 12287,
-                  "size": 4096
-                }
-              },
-              "hier_0_axi_dma_axi_dma_0": {
-                "Reg": {
-                  "baseaddr": 12288,
-                  "highaddr": 16383,
-                  "size": 4096
-                }
-              },
-              "timer_0_axi_timer_0": {
-                "Reg": {
-                  "baseaddr": 16384,
-                  "highaddr": 20479,
-                  "size": 4096
-                }
-              },
-              "hier_0_axis_interconnect_0_axis_interconnect_0_xbar": {
-                "Reg": {
-                  "baseaddr": 20480,
-                  "highaddr": 24575,
-                  "size": 4096
-                }
-              },
-              "hier_0_axi_fifo_mm_s_0": {
-                "Mem0": {
-                  "baseaddr": 24576,
-                  "highaddr": 28671,
-                  "size": 4096
-                },
-                "Mem1": {
-                  "baseaddr": 49152,
-                  "highaddr": 57343,
-                  "size": 8192
-                }
-              },
-              "pcie_0_axi_reset_0": {
-                "Reg": {
-                  "baseaddr": 28672,
-                  "highaddr": 32767,
-                  "size": 4096
-                }
-              },
-              "hier_0_rtds_axis_0": {
-                "reg0": {
-                  "baseaddr": 32768,
-                  "highaddr": 36863,
-                  "size": 4096
-                }
-              },
-              "hier_0_hls_dft_0": {
-                "Reg": {
-                  "baseaddr": 36864,
-                  "highaddr": 40959,
-                  "size": 4096
-                }
-              },
-              "pcie_0_axi_pcie_intc_0": {
-                "Reg": {
-                  "baseaddr": 45056,
-                  "highaddr": 49151,
-                  "size": 4096
-                }
-              },
-              "pcie_0_axi_pcie_0": {
-                "CTL0": {
-                  "baseaddr": 268435456,
-                  "highaddr": 536870911,
-                  "size": 268435456
-                }
-              }
-            },
             "M_AXI_MM2S": {
               "pcie_0_axi_pcie_0": {
                 "BAR0": {
@@ -134,17 +50,115 @@
                   "size": 4294967296
                 }
               }
+            },
+            "M_AXI_SG": {
+              "hier_0_axi_dma_axi_dma_0": {
+                "Reg": {
+                  "baseaddr": 4096,
+                  "highaddr": 8191,
+                  "size": 4096
+                }
+              },
+              "hier_0_axi_dma_axi_dma_1": {
+                "Reg": {
+                  "baseaddr": 8192,
+                  "highaddr": 12287,
+                  "size": 4096
+                }
+              },
+              "hier_0_axi_fifo_mm_s_0": {
+                "Mem0": {
+                  "baseaddr": 12288,
+                  "highaddr": 16383,
+                  "size": 4096
+                },
+                "Mem1": {
+                  "baseaddr": 16384,
+                  "highaddr": 24575,
+                  "size": 8192
+                }
+              },
+              "pcie_0_axi_pcie_intc_0": {
+                "Reg": {
+                  "baseaddr": 24576,
+                  "highaddr": 28671,
+                  "size": 4096
+                }
+              },
+              "pcie_0_axi_reset_0": {
+                "Reg": {
+                  "baseaddr": 28672,
+                  "highaddr": 32767,
+                  "size": 4096
+                }
+              },
+              "timer_0_axi_timer_0": {
+                "Reg": {
+                  "baseaddr": 32768,
+                  "highaddr": 36863,
+                  "size": 4096
+                }
+              },
+              "hier_0_hls_dft_0": {
+                "Reg": {
+                  "baseaddr": 36864,
+                  "highaddr": 40959,
+                  "size": 4096
+                }
+              },
+              "hier_0_rtds_axis_0": {
+                "reg0": {
+                  "baseaddr": 40960,
+                  "highaddr": 45055,
+                  "size": 4096
+                }
+              },
+              "hier_0_axis_interconnect_0_axis_interconnect_0_xbar": {
+                "Reg": {
+                  "baseaddr": 45056,
+                  "highaddr": 49151,
+                  "size": 4096
+                }
+              },
+              "bram_0_axi_bram_ctrl_0": {
+                "Mem0": {
+                  "baseaddr": 49152,
+                  "highaddr": 57343,
+                  "size": 8192
+                }
+              },
+              "hier_0_rtds2gpu_0": {
+                "Reg": {
+                  "baseaddr": 57344,
+                  "highaddr": 61439,
+                  "size": 4096
+                }
+              },
+              "hier_0_gpu2rtds_0": {
+                "Reg": {
+                  "baseaddr": 61440,
+                  "highaddr": 65535,
+                  "size": 4096
+                }
+              },
+              "pcie_0_axi_pcie_0": {
+                "CTL0": {
+                  "baseaddr": 268435456,
+                  "highaddr": 536870911,
+                  "size": 268435456
+                }
+              }
             }
           },
           "ports": [
             {
               "role": "master",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:S01_AXIS",
               "name": "MM2S"
             },
             {
               "role": "slave",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:M01_AXIS",
               "name": "S2MM"
             }
           ],
@@ -178,12 +192,12 @@
           "ports": [
             {
               "role": "master",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:S06_AXIS",
               "name": "MM2S"
             },
             {
               "role": "slave",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:M06_AXIS",
               "name": "S2MM"
             }
           ],
@@ -197,12 +211,12 @@
           "ports": [
             {
               "role": "master",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:2",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:S02_AXIS",
               "name": "STR_TXD"
             },
             {
               "role": "slave",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:2",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:M02_AXIS",
               "name": "STR_RXD"
             }
           ],
@@ -210,43 +224,143 @@
             "interrupt": "pcie_0_axi_pcie_intc_0:2"
           }
         },
+        "hier_0_axis_data_fifo_0": {
+          "vlnv": "xilinx.com:ip:axis_data_fifo:1.1",
+          "ports": [
+            {
+              "role": "master",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:S03_AXIS",
+              "name": "AXIS"
+            },
+            {
+              "role": "slave",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:M03_AXIS",
+              "name": "AXIS"
+            }
+          ]
+        },
+        "hier_0_axis_data_fifo_1": {
+          "vlnv": "xilinx.com:ip:axis_data_fifo:1.1",
+          "ports": [
+            {
+              "role": "master",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:S04_AXIS",
+              "name": "AXIS"
+            },
+            {
+              "role": "slave",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:M04_AXIS",
+              "name": "AXIS"
+            }
+          ]
+        },
         "hier_0_axis_interconnect_0_axis_interconnect_0_xbar": {
           "vlnv": "xilinx.com:ip:axis_switch:1.1",
           "ports": [
+            {
+              "role": "slave",
+              "target": "hier_0_rtds_axis_0:m_axis",
+              "name": "S00_AXIS"
+            },
             {
               "role": "master",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
-              "name": "M03_AXIS"
+              "target": "hier_0_rtds_axis_0:s_axis",
+              "name": "M00_AXIS"
             },
             {
               "role": "slave",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
+              "target": "hier_0_axi_dma_axi_dma_0:MM2S",
+              "name": "S01_AXIS"
+            },
+            {
+              "role": "master",
+              "target": "hier_0_axi_dma_axi_dma_0:S2MM",
+              "name": "M01_AXIS"
+            },
+            {
+              "role": "slave",
+              "target": "hier_0_axi_fifo_mm_s_0:STR_TXD",
+              "name": "S02_AXIS"
+            },
+            {
+              "role": "master",
+              "target": "hier_0_axi_fifo_mm_s_0:STR_RXD",
+              "name": "M02_AXIS"
+            },
+            {
+              "role": "slave",
+              "target": "hier_0_axis_data_fifo_0:AXIS",
               "name": "S03_AXIS"
             },
             {
               "role": "master",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
+              "target": "hier_0_axis_data_fifo_0:AXIS",
+              "name": "M03_AXIS"
+            },
+            {
+              "role": "slave",
+              "target": "hier_0_axis_data_fifo_1:AXIS",
+              "name": "S04_AXIS"
+            },
+            {
+              "role": "master",
+              "target": "hier_0_axis_data_fifo_1:AXIS",
               "name": "M04_AXIS"
             },
             {
               "role": "slave",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
-              "name": "S04_AXIS"
+              "target": "hier_0_hls_dft_0:output_r",
+              "name": "S05_AXIS"
+            },
+            {
+              "role": "master",
+              "target": "hier_0_hls_dft_0:input_r",
+              "name": "M05_AXIS"
+            },
+            {
+              "role": "slave",
+              "target": "hier_0_axi_dma_axi_dma_1:MM2S",
+              "name": "S06_AXIS"
+            },
+            {
+              "role": "master",
+              "target": "hier_0_axi_dma_axi_dma_1:S2MM",
+              "name": "M06_AXIS"
+            },
+            {
+              "role": "slave",
+              "target": "hier_0_gpu2rtds_0:rtds_output",
+              "name": "S07_AXIS"
+            },
+            {
+              "role": "master",
+              "target": "hier_0_rtds2gpu_0:rtds_input",
+              "name": "M07_AXIS"
             }
           ],
-          "num_ports": 7
+          "num_ports": 8
+        },
+        "hier_0_gpu2rtds_0": {
+          "vlnv": "acs.eonerc.rwth-aachen.de:hls:gpu2rtds:1.0",
+          "ports": [
+            {
+              "role": "master",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:S07_AXIS",
+              "name": "rtds_output"
+            }
+          ]
         },
         "hier_0_hls_dft_0": {
           "vlnv": "acs.eonerc.rwth-aachen.de:hls:hls_dft:1.1",
           "ports": [
             {
               "role": "master",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:5",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:S05_AXIS",
               "name": "output_r"
             },
             {
               "role": "slave",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:5",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:M05_AXIS",
               "name": "input_r"
             }
           ],
@@ -254,17 +368,38 @@
             "interrupt": "pcie_0_axi_pcie_intc_0:1"
           }
         },
+        "hier_0_rtds2gpu_0": {
+          "vlnv": "acs.eonerc.rwth-aachen.de:hls:rtds2gpu:1.1",
+          "memory-view": {
+            "m_axi_axi_mm": {
+              "pcie_0_axi_pcie_0": {
+                "BAR0": {
+                  "baseaddr": 0,
+                  "highaddr": 4294967295,
+                  "size": 4294967296
+                }
+              }
+            }
+          },
+          "ports": [
+            {
+              "role": "slave",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:M07_AXIS",
+              "name": "rtds_input"
+            }
+          ]
+        },
         "hier_0_rtds_axis_0": {
           "vlnv": "acs.eonerc.rwth-aachen.de:user:rtds_axis:1.0",
           "ports": [
             {
               "role": "master",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:0",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:S00_AXIS",
               "name": "m_axis"
             },
             {
               "role": "slave",
-              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:0",
+              "target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:M00_AXIS",
               "name": "s_axis"
             }
           ],
@@ -278,11 +413,11 @@
           "vlnv": "xilinx.com:ip:axi_pcie:2.8",
           "memory-view": {
             "M_AXI": {
-              "bram_0_axi_bram_ctrl_0": {
-                "Mem0": {
-                  "baseaddr": 0,
+              "hier_0_axi_dma_axi_dma_0": {
+                "Reg": {
+                  "baseaddr": 4096,
                   "highaddr": 8191,
-                  "size": 8192
+                  "size": 4096
                 }
               },
               "hier_0_axi_dma_axi_dma_1": {
@@ -292,37 +427,23 @@
                   "size": 4096
                 }
               },
-              "hier_0_axi_dma_axi_dma_0": {
-                "Reg": {
+              "hier_0_axi_fifo_mm_s_0": {
+                "Mem0": {
                   "baseaddr": 12288,
                   "highaddr": 16383,
                   "size": 4096
-                }
-              },
-              "timer_0_axi_timer_0": {
-                "Reg": {
+                },
+                "Mem1": {
                   "baseaddr": 16384,
-                  "highaddr": 20479,
-                  "size": 4096
-                }
-              },
-              "hier_0_axis_interconnect_0_axis_interconnect_0_xbar": {
-                "Reg": {
-                  "baseaddr": 20480,
                   "highaddr": 24575,
-                  "size": 4096
+                  "size": 8192
                 }
               },
-              "hier_0_axi_fifo_mm_s_0": {
-                "Mem0": {
+              "pcie_0_axi_pcie_intc_0": {
+                "Reg": {
                   "baseaddr": 24576,
                   "highaddr": 28671,
                   "size": 4096
-                },
-                "Mem1": {
-                  "baseaddr": 49152,
-                  "highaddr": 57343,
-                  "size": 8192
                 }
               },
               "pcie_0_axi_reset_0": {
@@ -332,8 +453,8 @@
                   "size": 4096
                 }
               },
-              "hier_0_rtds_axis_0": {
-                "reg0": {
+              "timer_0_axi_timer_0": {
+                "Reg": {
                   "baseaddr": 32768,
                   "highaddr": 36863,
                   "size": 4096
@@ -346,13 +467,41 @@
                   "size": 4096
                 }
               },
-              "pcie_0_axi_pcie_intc_0": {
+              "hier_0_rtds_axis_0": {
+                "reg0": {
+                  "baseaddr": 40960,
+                  "highaddr": 45055,
+                  "size": 4096
+                }
+              },
+              "hier_0_axis_interconnect_0_axis_interconnect_0_xbar": {
                 "Reg": {
                   "baseaddr": 45056,
                   "highaddr": 49151,
                   "size": 4096
                 }
               },
+              "bram_0_axi_bram_ctrl_0": {
+                "Mem0": {
+                  "baseaddr": 49152,
+                  "highaddr": 57343,
+                  "size": 8192
+                }
+              },
+              "hier_0_rtds2gpu_0": {
+                "Reg": {
+                  "baseaddr": 57344,
+                  "highaddr": 61439,
+                  "size": 4096
+                }
+              },
+              "hier_0_gpu2rtds_0": {
+                "Reg": {
+                  "baseaddr": 61440,
+                  "highaddr": 65535,
+                  "size": 4096
+                }
+              },
               "pcie_0_axi_pcie_0": {
                 "CTL0": {
                   "baseaddr": 268435456,
diff --git a/fpga/include/villas/fpga/ip_node.hpp b/fpga/include/villas/fpga/ip_node.hpp
index 427917fc0..1ce9ded0c 100644
--- a/fpga/include/villas/fpga/ip_node.hpp
+++ b/fpga/include/villas/fpga/ip_node.hpp
@@ -71,8 +71,8 @@ public:
 	                        const std::string& port,
 	                        bool isMaster)
 	{
-		for(auto& [vertexId, vertex] : vertices) {
-			(void) vertexId;
+		for(auto& vertexEntry : vertices) {
+			auto& vertex = vertexEntry.second;
 			if(vertex->nodeName == node and vertex->portName == port and vertex->isMaster == isMaster)
 				return vertex;
 		}
@@ -86,7 +86,7 @@ public:
 };
 
 
-class IpNode : public IpCore {
+class IpNode : public virtual IpCore {
 public:
 
 	friend class IpNodeFactory;
diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp
index 963c20cda..12cf0236a 100644
--- a/fpga/include/villas/fpga/ips/dma.hpp
+++ b/fpga/include/villas/fpga/ips/dma.hpp
@@ -43,7 +43,10 @@ public:
 	bool init();
 	bool reset();
 
+	// memory-mapped to stream (MM2S)
 	bool write(const MemoryBlock& mem, size_t len);
+
+	// stream to memory-mapped (S2MM)
 	bool read(const MemoryBlock& mem, size_t len);
 
 	size_t writeComplete()
diff --git a/fpga/include/villas/fpga/ips/gpu2rtds.hpp b/fpga/include/villas/fpga/ips/gpu2rtds.hpp
new file mode 100644
index 000000000..1aab1a2aa
--- /dev/null
+++ b/fpga/include/villas/fpga/ips/gpu2rtds.hpp
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <villas/memory.hpp>
+#include <villas/fpga/ip_node.hpp>
+#include <villas/fpga/ips/hls.hpp>
+
+#include <villas/fpga/ips/rtds2gpu/register_types.hpp>
+#include <villas/fpga/ips/rtds2gpu/xgpu2rtds_hw.h>
+
+namespace villas {
+namespace fpga {
+namespace ip {
+
+
+class Gpu2Rtds : public IpNode, public Hls
+{
+public:
+	friend class Gpu2RtdsFactory;
+
+	bool init();
+
+	void dump(spdlog::level::level_enum logLevel = spdlog::level::info);
+	bool startOnce(size_t frameSize);
+
+	size_t getMaxFrameSize();
+
+	const StreamVertex&
+	getDefaultMasterPort() const
+	{ return getMasterPort(rtdsOutputStreamPort); }
+
+	MemoryBlock
+	getRegisterMemory() const
+	{ return MemoryBlock(0, 1 << 10, getAddressSpaceId(registerMemory)); }
+
+private:
+	bool updateStatus();
+
+public:
+	static constexpr const char* rtdsOutputStreamPort = "rtds_output";
+
+	struct StatusControlRegister { uint32_t
+		status_ap_vld	: 1,
+		_res			: 31;
+	};
+
+	using StatusRegister = axilite_reg_status_t;
+
+	static constexpr uintptr_t registerStatusOffset = XGPU2RTDS_CTRL_ADDR_STATUS_DATA;
+	static constexpr uintptr_t registerStatusCtrlOffset = XGPU2RTDS_CTRL_ADDR_STATUS_CTRL;
+	static constexpr uintptr_t registerFrameSizeOffset = XGPU2RTDS_CTRL_ADDR_FRAME_SIZE_DATA;
+	static constexpr uintptr_t registerFrameOffset = XGPU2RTDS_CTRL_ADDR_FRAME_BASE;
+	static constexpr uintptr_t registerFrameLength = XGPU2RTDS_CTRL_DEPTH_FRAME;
+
+public:
+	StatusRegister* registerStatus;
+	StatusControlRegister* registerStatusCtrl;
+	uint32_t* registerFrameSize;
+	uint32_t* registerFrames;
+
+	size_t maxFrameSize;
+
+	bool started;
+};
+
+
+class Gpu2RtdsFactory : public IpNodeFactory {
+public:
+	Gpu2RtdsFactory();
+
+	IpCore* create()
+	{ return new Gpu2Rtds; }
+
+	std::string
+	getName() const
+	{ return "Gpu2Rtds"; }
+
+	std::string
+	getDescription() const
+	{ return "HLS Gpu2Rtds IP"; }
+
+	Vlnv getCompatibleVlnv() const
+	{ return {"acs.eonerc.rwth-aachen.de:hls:gpu2rtds:"}; }
+};
+
+} // namespace ip
+} // namespace fpga
+} // namespace villas
diff --git a/fpga/include/villas/fpga/ips/hls.hpp b/fpga/include/villas/fpga/ips/hls.hpp
new file mode 100644
index 000000000..1184fdfd9
--- /dev/null
+++ b/fpga/include/villas/fpga/ips/hls.hpp
@@ -0,0 +1,137 @@
+#pragma once
+
+#include <villas/memory.hpp>
+#include <villas/fpga/ip_node.hpp>
+
+namespace villas {
+namespace fpga {
+namespace ip {
+
+
+class Hls : public virtual IpCore
+{
+public:
+	virtual bool init()
+	{
+		auto& registers = addressTranslations.at(registerMemory);
+
+		controlRegister = reinterpret_cast<ControlRegister*>(registers.getLocalAddr(registerControlAddr));
+		globalIntRegister = reinterpret_cast<GlobalIntRegister*>(registers.getLocalAddr(registerGlobalIntEnableAddr));
+		ipIntEnableRegister = reinterpret_cast<IpIntRegister*>(registers.getLocalAddr(registerIntEnableAddr));
+		ipIntStatusRegister = reinterpret_cast<IpIntRegister*>(registers.getLocalAddr(registerIntStatusAddr));
+
+		setAutoRestart(false);
+		setGlobalInterrupt(false);
+
+		return true;
+	}
+
+	bool start()
+	{
+		controlRegister->ap_start = true;
+		running = true;
+
+		return true;
+	}
+
+	virtual bool isFinished()
+	{ updateRunningStatus(); return !running; }
+
+
+	bool isRunning()
+	{ updateRunningStatus(); return running; }
+
+
+	void setAutoRestart(bool enabled) const
+	{ controlRegister->auto_restart = enabled; }
+
+
+	void setGlobalInterrupt(bool enabled) const
+	{ globalIntRegister->globalInterruptEnable = enabled; }
+
+
+	void setReadyInterrupt(bool enabled) const
+	{ ipIntEnableRegister->ap_ready = enabled; }
+
+
+	void setDoneInterrupt(bool enabled) const
+	{ ipIntEnableRegister->ap_done = enabled; }
+
+
+	bool isIdleBit() const
+	{ return controlRegister->ap_idle; }
+
+
+	bool isReadyBit() const
+	{ return controlRegister->ap_ready; }
+
+
+	/// Warning: the corresponding bit is cleared on read of the register, so if
+	/// not used correctly, this function may never return true. Only use this
+	/// function if you really know what you are doing!
+	bool isDoneBit() const
+	{ return controlRegister->ap_done; }
+
+
+	bool isAutoRestartBit() const
+	{ return controlRegister->auto_restart; }
+
+private:
+	void updateRunningStatus()
+	{
+		if(running and isIdleBit())
+			running = false;
+	}
+
+protected:
+	/* Memory block handling */
+
+	static constexpr const char* registerMemory = "Reg";
+
+	virtual std::list<MemoryBlockName> getMemoryBlocks() const
+	{ return { registerMemory }; }
+
+
+public:
+	/* Register definitions */
+
+	static constexpr uintptr_t registerControlAddr			= 0x00;
+	static constexpr uintptr_t registerGlobalIntEnableAddr	= 0x04;
+	static constexpr uintptr_t registerIntEnableAddr		= 0x08;
+	static constexpr uintptr_t registerIntStatusAddr		= 0x0c;
+
+	union ControlRegister {
+		uint32_t value;
+		struct  { uint32_t
+			ap_start				: 1,
+			ap_done					: 1,
+			ap_idle					: 1,
+			ap_ready				: 1,
+			_res1					: 3,
+			auto_restart			: 1,
+			_res2					: 24;
+		};
+	};
+
+	struct GlobalIntRegister { uint32_t
+		globalInterruptEnable	: 1,
+		_res					: 31;
+	};
+
+	struct IpIntRegister { uint32_t
+		ap_done : 1,
+		ap_ready : 1,
+		_res : 30;
+	};
+protected:
+	ControlRegister*	controlRegister;
+	GlobalIntRegister*	globalIntRegister;
+	IpIntRegister*		ipIntEnableRegister;
+	IpIntRegister*		ipIntStatusRegister;
+
+	bool running;
+};
+
+} // namespace ip
+} // namespace fpga
+} // namespace villas
diff --git a/fpga/include/villas/fpga/ips/rtds.hpp b/fpga/include/villas/fpga/ips/rtds.hpp
index 35aeef133..4f6d789e0 100644
--- a/fpga/include/villas/fpga/ips/rtds.hpp
+++ b/fpga/include/villas/fpga/ips/rtds.hpp
@@ -44,6 +44,14 @@ public:
 	std::list<std::string> getMemoryBlocks() const
 	{ return { registerMemory }; }
 
+	const StreamVertex&
+	getDefaultSlavePort() const
+	{ return getSlavePort(slavePort); }
+
+	const StreamVertex&
+	getDefaultMasterPort() const
+	{ return getMasterPort(masterPort); }
+
 private:
 	static constexpr const char registerMemory[] = "reg0";
 	static constexpr const char* irqTs = "irq_ts";
diff --git a/fpga/include/villas/fpga/ips/rtds2gpu.hpp b/fpga/include/villas/fpga/ips/rtds2gpu.hpp
new file mode 100644
index 000000000..b956805c3
--- /dev/null
+++ b/fpga/include/villas/fpga/ips/rtds2gpu.hpp
@@ -0,0 +1,96 @@
+#pragma once
+
+#include <villas/memory.hpp>
+#include <villas/fpga/ip_node.hpp>
+#include <villas/fpga/ips/hls.hpp>
+
+#include "rtds2gpu/xrtds2gpu.h"
+#include "rtds2gpu/register_types.hpp"
+
+namespace villas {
+namespace fpga {
+namespace ip {
+
+union ControlRegister {
+	uint32_t value;
+	struct  { uint32_t
+		ap_start				: 1,
+		ap_done					: 1,
+		ap_idle					: 1,
+		ap_ready				: 1,
+		_res1					: 3,
+		auto_restart			: 1,
+		_res2					: 24;
+	};
+};
+
+
+class Rtds2Gpu : public IpNode, public Hls
+{
+public:
+	friend class Rtds2GpuFactory;
+
+	bool init();
+
+	void dump(spdlog::level::level_enum logLevel = spdlog::level::info);
+
+	bool startOnce(const MemoryBlock& mem, size_t frameSize, size_t dataOffset, size_t doorbellOffset);
+
+	size_t getMaxFrameSize();
+
+	void dumpDoorbell(uint32_t doorbellRegister) const;
+
+	bool doorbellIsValid(const uint32_t& doorbellRegister) const
+	{ return reinterpret_cast<const reg_doorbell_t&>(doorbellRegister).is_valid; }
+
+	void doorbellReset(uint32_t& doorbellRegister) const
+	{ doorbellRegister = 0; }
+
+	static constexpr const char* registerMemory = "Reg";
+
+	std::list<MemoryBlockName> getMemoryBlocks() const
+	{ return { registerMemory }; }
+
+
+	const StreamVertex&
+	getDefaultSlavePort() const
+	{ return getSlavePort(rtdsInputStreamPort); }
+
+private:
+	bool updateStatus();
+
+private:
+	static constexpr const char* axiInterface = "m_axi_axi_mm";
+	static constexpr const char* rtdsInputStreamPort = "rtds_input";
+
+	XRtds2gpu xInstance;
+
+	axilite_reg_status_t status;
+	size_t maxFrameSize;
+
+	bool started;
+};
+
+
+class Rtds2GpuFactory : public IpNodeFactory {
+public:
+	Rtds2GpuFactory();
+
+	IpCore* create()
+	{ return new Rtds2Gpu; }
+
+	std::string
+	getName() const
+	{ return "Rtds2Gpu"; }
+
+	std::string
+	getDescription() const
+	{ return "HLS RTDS2GPU IP"; }
+
+	Vlnv getCompatibleVlnv() const
+	{ return {"acs.eonerc.rwth-aachen.de:hls:rtds2gpu:"}; }
+};
+
+} // namespace ip
+} // namespace fpga
+} // namespace villas
diff --git a/fpga/include/villas/fpga/ips/rtds2gpu/register_types.hpp b/fpga/include/villas/fpga/ips/rtds2gpu/register_types.hpp
new file mode 100644
index 000000000..ce7328417
--- /dev/null
+++ b/fpga/include/villas/fpga/ips/rtds2gpu/register_types.hpp
@@ -0,0 +1,57 @@
+#ifndef REGISTER_TYPES_H
+#define REGISTER_TYPES_H
+
+#include <stdint.h>
+#include <cstddef>
+#include <cstdint>
+
+union axilite_reg_status_t {
+	uint32_t value;
+	struct {
+		uint32_t
+		last_seq_nr			: 16,
+		last_count			: 6,
+		max_frame_size		: 6,
+		invalid_frame_size	: 1,
+		frame_too_short		: 1,
+		frame_too_long		: 1,
+		is_running			: 1;
+	};
+};
+
+union reg_doorbell_t {
+	uint32_t value;
+	struct {
+		uint32_t
+		seq_nr			: 16,
+		count			: 6,
+		is_valid		: 1;
+	};
+
+	constexpr reg_doorbell_t() : value(0) {}
+};
+
+template<size_t N, typename T = uint32_t>
+struct Rtds2GpuMemoryBuffer {
+	// this type is only for memory interpretation, it makes no sense to create
+	// an instance so it's forbidden
+	Rtds2GpuMemoryBuffer() = delete;
+
+	// T can be a more complex type that wraps multiple values
+	static constexpr size_t rawValueCount = N * (sizeof(T) / 4);
+
+	// As of C++14, offsetof() is not working for non-standard layout types (i.e.
+	// composed of non-POD members). This might work in C++17 though.
+	// More info: https://gist.github.com/graphitemaster/494f21190bb2c63c5516
+	//static constexpr size_t doorbellOffset = offsetof(Rtds2GpuMemoryBuffer, doorbell);
+	//static constexpr size_t dataOffset = offsetof(Rtds2GpuMemoryBuffer, data);
+
+	// HACK: This might break horribly, let's just hope C++17 will be there soon
+	static constexpr size_t dataOffset = 0;
+	static constexpr size_t doorbellOffset = N * sizeof(Rtds2GpuMemoryBuffer::data);
+
+	T data[N];
+	reg_doorbell_t doorbell;
+};
+
+#endif // REGISTER_TYPES_H
diff --git a/fpga/include/villas/fpga/ips/rtds2gpu/xgpu2rtds_hw.h b/fpga/include/villas/fpga/ips/rtds2gpu/xgpu2rtds_hw.h
new file mode 100644
index 000000000..8ea61f0f1
--- /dev/null
+++ b/fpga/include/villas/fpga/ips/rtds2gpu/xgpu2rtds_hw.h
@@ -0,0 +1,53 @@
+// ==============================================================
+// File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC
+// Version: 2017.3
+// Copyright (C) 1986-2017 Xilinx, Inc. All Rights Reserved.
+// 
+// ==============================================================
+
+// CTRL
+// 0x00 : Control signals
+//        bit 0  - ap_start (Read/Write/COH)
+//        bit 1  - ap_done (Read/COR)
+//        bit 2  - ap_idle (Read)
+//        bit 3  - ap_ready (Read)
+//        bit 7  - auto_restart (Read/Write)
+//        others - reserved
+// 0x04 : Global Interrupt Enable Register
+//        bit 0  - Global Interrupt Enable (Read/Write)
+//        others - reserved
+// 0x08 : IP Interrupt Enable Register (Read/Write)
+//        bit 0  - Channel 0 (ap_done)
+//        bit 1  - Channel 1 (ap_ready)
+//        others - reserved
+// 0x0c : IP Interrupt Status Register (Read/TOW)
+//        bit 0  - Channel 0 (ap_done)
+//        bit 1  - Channel 1 (ap_ready)
+//        others - reserved
+// 0x10 : Data signal of frame_size
+//        bit 31~0 - frame_size[31:0] (Read/Write)
+// 0x14 : reserved
+// 0x80 : Data signal of status
+//        bit 31~0 - status[31:0] (Read)
+// 0x84 : Control signal of status
+//        bit 0  - status_ap_vld (Read/COR)
+//        others - reserved
+// 0x40 ~
+// 0x7f : Memory 'frame' (16 * 32b)
+//        Word n : bit [31:0] - frame[n]
+// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)
+
+#define XGPU2RTDS_CTRL_ADDR_AP_CTRL         0x00
+#define XGPU2RTDS_CTRL_ADDR_GIE             0x04
+#define XGPU2RTDS_CTRL_ADDR_IER             0x08
+#define XGPU2RTDS_CTRL_ADDR_ISR             0x0c
+#define XGPU2RTDS_CTRL_ADDR_FRAME_SIZE_DATA 0x10
+#define XGPU2RTDS_CTRL_BITS_FRAME_SIZE_DATA 32
+#define XGPU2RTDS_CTRL_ADDR_STATUS_DATA     0x80
+#define XGPU2RTDS_CTRL_BITS_STATUS_DATA     32
+#define XGPU2RTDS_CTRL_ADDR_STATUS_CTRL     0x84
+#define XGPU2RTDS_CTRL_ADDR_FRAME_BASE      0x40
+#define XGPU2RTDS_CTRL_ADDR_FRAME_HIGH      0x7f
+#define XGPU2RTDS_CTRL_WIDTH_FRAME          32
+#define XGPU2RTDS_CTRL_DEPTH_FRAME          16
+
diff --git a/fpga/include/villas/fpga/ips/rtds2gpu/xrtds2gpu.h b/fpga/include/villas/fpga/ips/rtds2gpu/xrtds2gpu.h
new file mode 100644
index 000000000..87cb2b70a
--- /dev/null
+++ b/fpga/include/villas/fpga/ips/rtds2gpu/xrtds2gpu.h
@@ -0,0 +1,113 @@
+// ==============================================================
+// File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC
+// Version: 2017.3
+// Copyright (C) 1986-2017 Xilinx, Inc. All Rights Reserved.
+// 
+// ==============================================================
+
+#ifndef XRTDS2GPU_H
+#define XRTDS2GPU_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***************************** Include Files *********************************/
+#ifndef __linux__
+#include "xil_types.h"
+#include "xil_assert.h"
+#include "xstatus.h"
+#include "xil_io.h"
+#else
+#include <stdint.h>
+#include <assert.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <stddef.h>
+#endif
+#include "xrtds2gpu_hw.h"
+
+/**************************** Type Definitions ******************************/
+#ifdef __linux__
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+#else
+typedef struct {
+    u16 DeviceId;
+    u32 Ctrl_BaseAddress;
+} XRtds2gpu_Config;
+#endif
+
+typedef struct {
+    u32 Ctrl_BaseAddress;
+    u32 IsReady;
+} XRtds2gpu;
+
+/***************** Macros (Inline Functions) Definitions *********************/
+#ifndef __linux__
+#define XRtds2gpu_WriteReg(BaseAddress, RegOffset, Data) \
+    Xil_Out32((BaseAddress) + (RegOffset), (u32)(Data))
+#define XRtds2gpu_ReadReg(BaseAddress, RegOffset) \
+    Xil_In32((BaseAddress) + (RegOffset))
+#else
+#define XRtds2gpu_WriteReg(BaseAddress, RegOffset, Data) \
+    *(volatile u32*)((BaseAddress) + (RegOffset)) = (u32)(Data)
+#define XRtds2gpu_ReadReg(BaseAddress, RegOffset) \
+    *(volatile u32*)((BaseAddress) + (RegOffset))
+
+#define Xil_AssertVoid(expr)    assert(expr)
+#define Xil_AssertNonvoid(expr) assert(expr)
+
+#define XST_SUCCESS             0
+#define XST_DEVICE_NOT_FOUND    2
+#define XST_OPEN_DEVICE_FAILED  3
+#define XIL_COMPONENT_IS_READY  1
+#endif
+
+/************************** Function Prototypes *****************************/
+#ifndef __linux__
+int XRtds2gpu_Initialize(XRtds2gpu *InstancePtr, u16 DeviceId);
+XRtds2gpu_Config* XRtds2gpu_LookupConfig(u16 DeviceId);
+int XRtds2gpu_CfgInitialize(XRtds2gpu *InstancePtr, XRtds2gpu_Config *ConfigPtr);
+#else
+int XRtds2gpu_Initialize(XRtds2gpu *InstancePtr, const char* InstanceName);
+int XRtds2gpu_Release(XRtds2gpu *InstancePtr);
+#endif
+
+void XRtds2gpu_Start(XRtds2gpu *InstancePtr);
+u32 XRtds2gpu_IsDone(XRtds2gpu *InstancePtr);
+u32 XRtds2gpu_IsIdle(XRtds2gpu *InstancePtr);
+u32 XRtds2gpu_IsReady(XRtds2gpu *InstancePtr);
+void XRtds2gpu_EnableAutoRestart(XRtds2gpu *InstancePtr);
+void XRtds2gpu_DisableAutoRestart(XRtds2gpu *InstancePtr);
+
+void XRtds2gpu_Set_baseaddr(XRtds2gpu *InstancePtr, u32 Data);
+u32 XRtds2gpu_Get_baseaddr(XRtds2gpu *InstancePtr);
+void XRtds2gpu_Set_data_offset(XRtds2gpu *InstancePtr, u32 Data);
+u32 XRtds2gpu_Get_data_offset(XRtds2gpu *InstancePtr);
+void XRtds2gpu_Set_doorbell_offset(XRtds2gpu *InstancePtr, u32 Data);
+u32 XRtds2gpu_Get_doorbell_offset(XRtds2gpu *InstancePtr);
+void XRtds2gpu_Set_frame_size(XRtds2gpu *InstancePtr, u32 Data);
+u32 XRtds2gpu_Get_frame_size(XRtds2gpu *InstancePtr);
+u32 XRtds2gpu_Get_status(XRtds2gpu *InstancePtr);
+u32 XRtds2gpu_Get_status_vld(XRtds2gpu *InstancePtr);
+
+void XRtds2gpu_InterruptGlobalEnable(XRtds2gpu *InstancePtr);
+void XRtds2gpu_InterruptGlobalDisable(XRtds2gpu *InstancePtr);
+void XRtds2gpu_InterruptEnable(XRtds2gpu *InstancePtr, u32 Mask);
+void XRtds2gpu_InterruptDisable(XRtds2gpu *InstancePtr, u32 Mask);
+void XRtds2gpu_InterruptClear(XRtds2gpu *InstancePtr, u32 Mask);
+u32 XRtds2gpu_InterruptGetEnabled(XRtds2gpu *InstancePtr);
+u32 XRtds2gpu_InterruptGetStatus(XRtds2gpu *InstancePtr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/fpga/include/villas/fpga/ips/rtds2gpu/xrtds2gpu_hw.h b/fpga/include/villas/fpga/ips/rtds2gpu/xrtds2gpu_hw.h
new file mode 100644
index 000000000..73bf1bce2
--- /dev/null
+++ b/fpga/include/villas/fpga/ips/rtds2gpu/xrtds2gpu_hw.h
@@ -0,0 +1,61 @@
+// ==============================================================
+// File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC
+// Version: 2017.3
+// Copyright (C) 1986-2017 Xilinx, Inc. All Rights Reserved.
+// 
+// ==============================================================
+
+// CTRL
+// 0x00 : Control signals
+//        bit 0  - ap_start (Read/Write/COH)
+//        bit 1  - ap_done (Read/COR)
+//        bit 2  - ap_idle (Read)
+//        bit 3  - ap_ready (Read)
+//        bit 7  - auto_restart (Read/Write)
+//        others - reserved
+// 0x04 : Global Interrupt Enable Register
+//        bit 0  - Global Interrupt Enable (Read/Write)
+//        others - reserved
+// 0x08 : IP Interrupt Enable Register (Read/Write)
+//        bit 0  - Channel 0 (ap_done)
+//        bit 1  - Channel 1 (ap_ready)
+//        others - reserved
+// 0x0c : IP Interrupt Status Register (Read/TOW)
+//        bit 0  - Channel 0 (ap_done)
+//        bit 1  - Channel 1 (ap_ready)
+//        others - reserved
+// 0x10 : Data signal of baseaddr
+//        bit 31~0 - baseaddr[31:0] (Read/Write)
+// 0x14 : reserved
+// 0x18 : Data signal of data_offset
+//        bit 31~0 - data_offset[31:0] (Read/Write)
+// 0x1c : reserved
+// 0x20 : Data signal of doorbell_offset
+//        bit 31~0 - doorbell_offset[31:0] (Read/Write)
+// 0x24 : reserved
+// 0x28 : Data signal of frame_size
+//        bit 31~0 - frame_size[31:0] (Read/Write)
+// 0x2c : reserved
+// 0x30 : Data signal of status
+//        bit 31~0 - status[31:0] (Read)
+// 0x34 : Control signal of status
+//        bit 0  - status_ap_vld (Read/COR)
+//        others - reserved
+// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)
+
+#define XRTDS2GPU_CTRL_ADDR_AP_CTRL              0x00
+#define XRTDS2GPU_CTRL_ADDR_GIE                  0x04
+#define XRTDS2GPU_CTRL_ADDR_IER                  0x08
+#define XRTDS2GPU_CTRL_ADDR_ISR                  0x0c
+#define XRTDS2GPU_CTRL_ADDR_BASEADDR_DATA        0x10
+#define XRTDS2GPU_CTRL_BITS_BASEADDR_DATA        32
+#define XRTDS2GPU_CTRL_ADDR_DATA_OFFSET_DATA     0x18
+#define XRTDS2GPU_CTRL_BITS_DATA_OFFSET_DATA     32
+#define XRTDS2GPU_CTRL_ADDR_DOORBELL_OFFSET_DATA 0x20
+#define XRTDS2GPU_CTRL_BITS_DOORBELL_OFFSET_DATA 32
+#define XRTDS2GPU_CTRL_ADDR_FRAME_SIZE_DATA      0x28
+#define XRTDS2GPU_CTRL_BITS_FRAME_SIZE_DATA      32
+#define XRTDS2GPU_CTRL_ADDR_STATUS_DATA          0x30
+#define XRTDS2GPU_CTRL_BITS_STATUS_DATA          32
+#define XRTDS2GPU_CTRL_ADDR_STATUS_CTRL          0x34
+
diff --git a/fpga/lib/CMakeLists.txt b/fpga/lib/CMakeLists.txt
index 4289c5276..b4d3f34b4 100644
--- a/fpga/lib/CMakeLists.txt
+++ b/fpga/lib/CMakeLists.txt
@@ -34,8 +34,16 @@ set(SOURCES
 	ips/dma.cpp
 	ips/bram.cpp
 	ips/rtds.cpp
+
+	ips/rtds2gpu/rtds2gpu.cpp
+	ips/rtds2gpu/xrtds2gpu.c
+	ips/rtds2gpu/gpu2rtds.cpp
 )
 
+# we don't have much influence on drivers generated by Xilinx, so ignore warnings
+set_source_files_properties(ips/rtds2gpu/xrtds2gpu.c
+	PROPERTIES COMPILE_FLAGS -Wno-int-to-pointer-cast)
+
 include(FindPkgConfig)
 
 pkg_check_modules(JANSSON jansson)
diff --git a/fpga/lib/gpu/gpu.cpp b/fpga/lib/gpu/gpu.cpp
index fbefd5e36..e45efa7b2 100644
--- a/fpga/lib/gpu/gpu.cpp
+++ b/fpga/lib/gpu/gpu.cpp
@@ -374,6 +374,13 @@ void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t si
 	cudaDeviceSynchronize();
 }
 
+MemoryTranslation
+Gpu::translate(const MemoryBlock& dst)
+{
+	auto& mm = MemoryManager::get();
+	return mm.getTranslation(masterPciEAddrSpaceId, dst.getAddrSpaceId());
+}
+
 
 std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
 GpuAllocator::allocateBlock(size_t size)
@@ -381,29 +388,53 @@ GpuAllocator::allocateBlock(size_t size)
 	cudaSetDevice(gpu.gpuId);
 
 	void* addr;
-	if(cudaSuccess != cudaMalloc(&addr, size)) {
-		logger->error("cudaMalloc(..., size={}) failed", size);
-		throw std::bad_alloc();
-	}
-
 	auto& mm = MemoryManager::get();
 
-	// assemble name for this block
-	std::stringstream name;
-	name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
+	// search for an existing chunk that has enough free memory
+	auto chunk = std::find_if(chunks.begin(), chunks.end(), [&](const auto& chunk) {
+		return chunk->getAvailableMemory() >= size;
+	});
 
-	auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
-	auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
 
-	const auto localAddr = reinterpret_cast<uintptr_t>(addr);
-	std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
-	        mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
+	if(chunk != chunks.end()) {
+		logger->debug("Found existing chunk that can host the requested block");
 
-	insertMemoryBlock(*mem);
+		return (*chunk)->allocateBlock(size);
 
-	gpu.makeAccessibleToPCIeAndVA(*mem);
+	} else {
+		// allocate a new chunk
 
-	return mem;
+		// rounded-up multiple of GPU page size
+		const size_t chunkSize = size - (size & (GpuPageSize - 1)) + GpuPageSize;
+		logger->debug("Allocate new chunk of {:#x} bytes", chunkSize);
+
+		if(cudaSuccess != cudaMalloc(&addr, chunkSize)) {
+			logger->error("cudaMalloc(..., size={}) failed", chunkSize);
+			throw std::bad_alloc();
+		}
+
+		// assemble name for this block
+		std::stringstream name;
+		name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
+
+		auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
+		auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
+
+		const auto localAddr = reinterpret_cast<uintptr_t>(addr);
+		std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
+		        mem(new MemoryBlock(localAddr, chunkSize, blockAddrSpaceId), this->free);
+
+		insertMemoryBlock(*mem);
+
+		// already make accessible to CPU
+		gpu.makeAccessibleToPCIeAndVA(*mem);
+
+		// create a new allocator to manage the chunk and push to chunk list
+		chunks.push_front(std::make_unique<LinearAllocator>(std::move(mem)));
+
+		// call again, this time there's a large enough chunk
+		return allocateBlock(size);
+	}
 }
 
 
diff --git a/fpga/lib/gpu/include/villas/gpu.hpp b/fpga/lib/gpu/include/villas/gpu.hpp
index 6e8dc0a6d..6e130bb30 100644
--- a/fpga/lib/gpu/include/villas/gpu.hpp
+++ b/fpga/lib/gpu/include/villas/gpu.hpp
@@ -58,6 +58,9 @@ public:
 
 	void memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
 
+	MemoryTranslation
+	translate(const MemoryBlock& dst);
+
 private:
 	bool registerIoMemory(const MemoryBlock& mem);
 	bool registerHostMemory(const MemoryBlock& mem);
@@ -81,6 +84,8 @@ private:
 
 class GpuAllocator : public BaseAllocator<GpuAllocator> {
 public:
+	static constexpr size_t GpuPageSize = 64UL << 10;
+
 	GpuAllocator(Gpu& gpu);
 
 	std::string getName() const;
@@ -90,6 +95,8 @@ public:
 
 private:
 	Gpu& gpu;
+	// TODO: replace by multimap (key is available memory)
+	std::list<std::unique_ptr<LinearAllocator>> chunks;
 };
 
 class GpuFactory : public Plugin {
diff --git a/fpga/lib/ip_node.cpp b/fpga/lib/ip_node.cpp
index c1dc36a5b..1a9ce5a48 100644
--- a/fpga/lib/ip_node.cpp
+++ b/fpga/lib/ip_node.cpp
@@ -41,7 +41,7 @@ IpNode::streamGraph;
 bool
 IpNodeFactory::configureJson(IpCore& ip, json_t* json_ip)
 {
-	auto& ipNode = reinterpret_cast<IpNode&>(ip);
+	auto& ipNode = dynamic_cast<IpNode&>(ip);
 	auto logger = getLogger();
 
 	json_t* json_ports = json_object_get(json_ip, "ports");
@@ -216,7 +216,7 @@ IpNode::connectLoopback()
 	logger->debug("switch at: {}", portMaster->nodeName);
 
 	// TODO: verify this is really a switch!
-	auto axiStreamSwitch = reinterpret_cast<ip::AxiStreamSwitch*>(
+	auto axiStreamSwitch = dynamic_cast<ip::AxiStreamSwitch*>(
 	                            card->lookupIp(portMaster->nodeName));
 
 	if(axiStreamSwitch == nullptr) {
diff --git a/fpga/lib/ips/bram.cpp b/fpga/lib/ips/bram.cpp
index 18a5ea0e2..fc881ff00 100644
--- a/fpga/lib/ips/bram.cpp
+++ b/fpga/lib/ips/bram.cpp
@@ -31,7 +31,7 @@ static BramFactory factory;
 bool
 BramFactory::configureJson(IpCore& ip, json_t* json_ip)
 {
-	auto& bram = reinterpret_cast<Bram&>(ip);
+	auto& bram = dynamic_cast<Bram&>(ip);
 
 	if(json_unpack(json_ip, "{ s: i }", "size", &bram.size) != 0) {
 		getLogger()->error("Cannot parse 'size'");
diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp
index 3d8f9a3b6..7408461df 100644
--- a/fpga/lib/ips/dma.cpp
+++ b/fpga/lib/ips/dma.cpp
@@ -165,7 +165,7 @@ Dma::write(const MemoryBlock& mem, size_t len)
 	                                     mem.getAddrSpaceId());
 	const void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));
 
-	logger->debug("Write to address: {:p}", buf);
+	logger->debug("Write to stream from address {:p}", buf);
 	return hasScatterGather() ? writeSG(buf, len) : writeSimple(buf, len);
 }
 
@@ -180,7 +180,7 @@ Dma::read(const MemoryBlock& mem, size_t len)
 	                                     mem.getAddrSpaceId());
 	void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));
 
-	logger->debug("Read from address: {:p}", buf);
+	logger->debug("Read from stream and write to address {:p}", buf);
 	return hasScatterGather() ? readSG(buf, len) : readSimple(buf, len);
 }
 
diff --git a/fpga/lib/ips/pcie.cpp b/fpga/lib/ips/pcie.cpp
index d8294469e..ea5675b7d 100644
--- a/fpga/lib/ips/pcie.cpp
+++ b/fpga/lib/ips/pcie.cpp
@@ -125,7 +125,7 @@ bool
 AxiPciExpressBridgeFactory::configureJson(IpCore& ip, json_t* json_ip)
 {
 	auto logger = getLogger();
-	auto& pcie = reinterpret_cast<AxiPciExpressBridge&>(ip);
+	auto& pcie = dynamic_cast<AxiPciExpressBridge&>(ip);
 
 	for(auto barType : std::list<std::string>{"axi_bars", "pcie_bars"}) {
 		json_t* json_bars = json_object_get(json_ip, barType.c_str());
diff --git a/fpga/lib/ips/rtds2gpu/gpu2rtds.cpp b/fpga/lib/ips/rtds2gpu/gpu2rtds.cpp
new file mode 100644
index 000000000..563d9da63
--- /dev/null
+++ b/fpga/lib/ips/rtds2gpu/gpu2rtds.cpp
@@ -0,0 +1,142 @@
+#include <unistd.h>
+#include <cstring>
+
+#include <villas/memory_manager.hpp>
+#include <villas/fpga/ips/gpu2rtds.hpp>
+
+#include "log.hpp"
+
+namespace villas {
+namespace fpga {
+namespace ip {
+
+static Gpu2RtdsFactory factory;
+
+bool Gpu2Rtds::init()
+{
+	Hls::init();
+
+	auto& registers = addressTranslations.at(registerMemory);
+
+	registerStatus = reinterpret_cast<StatusRegister*>(registers.getLocalAddr(registerStatusOffset));
+	registerStatusCtrl = reinterpret_cast<StatusControlRegister*>(registers.getLocalAddr(registerStatusCtrlOffset));
+	registerFrameSize = reinterpret_cast<uint32_t*>(registers.getLocalAddr(registerFrameSizeOffset));
+	registerFrames = reinterpret_cast<uint32_t*>(registers.getLocalAddr(registerFrameOffset));
+
+	maxFrameSize = getMaxFrameSize();
+	logger->info("Max. frame size supported: {}", maxFrameSize);
+
+	return true;
+}
+
+bool
+Gpu2Rtds::startOnce(size_t frameSize)
+{
+	*registerFrameSize = frameSize;
+
+	start();
+
+	return true;
+}
+
+void Gpu2Rtds::dump(spdlog::level::level_enum logLevel)
+{
+	const auto frame_size = *registerFrameSize;
+	auto status = *registerStatus;
+
+	logger->log(logLevel, "Gpu2Rtds registers:");
+	logger->log(logLevel, "  Frame size (words):       {:#x}", frame_size);
+	logger->log(logLevel, "  Status:                   {:#x}", status.value);
+	logger->log(logLevel, "    Running:            {}", (status.is_running ? "yes" : "no"));
+	logger->log(logLevel, "    Frame too short:    {}", (status.frame_too_short ? "yes" : "no"));
+	logger->log(logLevel, "    Frame too long:     {}", (status.frame_too_long ? "yes" : "no"));
+	logger->log(logLevel, "    Frame size invalid: {}", (status.invalid_frame_size ? "yes" : "no"));
+	logger->log(logLevel, "    Last count:         {}", status.last_count);
+	logger->log(logLevel, "    Last seq. number:   {}", status.last_seq_nr);
+	logger->log(logLevel, "    Max. frame size:    {}", status.max_frame_size);
+}
+
+//bool Gpu2Rtds::startOnce(const MemoryBlock& mem, size_t frameSize, size_t dataOffset, size_t doorbellOffset)
+//{
+//	auto& mm = MemoryManager::get();
+
+//	if(frameSize > maxFrameSize) {
+//		logger->error("Requested frame size of {} exceeds max. frame size of {}",
+//		              frameSize, maxFrameSize);
+//		return false;
+//	}
+
+//	auto translationFromIp = mm.getTranslation(
+//	                               getMasterAddrSpaceByInterface(axiInterface),
+//	                               mem.getAddrSpaceId());
+
+//	// set address of memory block in HLS IP
+//	XGpu2Rtds_Set_baseaddr(&xInstance, translationFromIp.getLocalAddr(0));
+
+//	XGpu2Rtds_Set_doorbell_offset(&xInstance, doorbellOffset);
+//	XGpu2Rtds_Set_data_offset(&xInstance, dataOffset);
+//	XGpu2Rtds_Set_frame_size(&xInstance, frameSize);
+
+//	// prepare memory with all zeroes
+//	auto translationFromProcess = mm.getTranslationFromProcess(mem.getAddrSpaceId());
+//	auto memory = reinterpret_cast<void*>(translationFromProcess.getLocalAddr(0));
+//	memset(memory, 0, mem.getSize());
+
+//	// start IP
+//	return start();
+//}
+
+
+
+
+
+//bool
+//Gpu2Rtds::updateStatus()
+//{
+//	if(not XGpu2Rtds_Get_status_vld(&xInstance))
+//		return false;
+
+//	status.value = XGpu2Rtds_Get_status(&xInstance);
+
+//	return true;
+//}
+
+size_t
+Gpu2Rtds::getMaxFrameSize()
+{
+	*registerFrameSize = 0;
+
+	start();
+	while(not isFinished());
+
+	while(not registerStatusCtrl->status_ap_vld);
+
+	axilite_reg_status_t status = *registerStatus;
+
+//	logger->debug("(*registerStatus).max_frame_size: {}", (*registerStatus).max_frame_size);
+//	logger->debug("status.max_frame_size: {}", status.max_frame_size);
+
+//	assert(status.max_frame_size == (*registerStatus).max_frame_size);
+
+	return status.max_frame_size;
+}
+
+//void
+//Gpu2Rtds::dumpDoorbell(uint32_t doorbellRegister) const
+//{
+//	auto& doorbell = reinterpret_cast<reg_doorbell_t&>(doorbellRegister);
+
+//	logger->info("Doorbell register: {:#08x}", doorbell.value);
+//	logger->info("  Valid:       {}", (doorbell.is_valid ? "yes" : "no"));
+//	logger->info("  Count:       {}", doorbell.count);
+//	logger->info("  Seq. number: {}", doorbell.seq_nr);
+//}
+
+Gpu2RtdsFactory::Gpu2RtdsFactory() :
+    IpNodeFactory(getName())
+{
+}
+
+} // namespace ip
+} // namespace fpga
+} // namespace villas
diff --git a/fpga/lib/ips/rtds2gpu/rtds2gpu.cpp b/fpga/lib/ips/rtds2gpu/rtds2gpu.cpp
new file mode 100644
index 000000000..e89574efa
--- /dev/null
+++ b/fpga/lib/ips/rtds2gpu/rtds2gpu.cpp
@@ -0,0 +1,131 @@
+#include <unistd.h>
+#include <cstring>
+
+#include <villas/memory_manager.hpp>
+#include <villas/fpga/ips/rtds2gpu.hpp>
+
+#include "log.hpp"
+
+namespace villas {
+namespace fpga {
+namespace ip {
+
+static Rtds2GpuFactory factory;
+
+bool Rtds2Gpu::init()
+{
+	Hls::init();
+
+	xInstance.IsReady = XIL_COMPONENT_IS_READY;
+	xInstance.Ctrl_BaseAddress = getBaseAddr(registerMemory);
+
+	status.value = 0;
+	started = false;
+
+//	maxFrameSize = getMaxFrameSize();
+	maxFrameSize = 16;
+	logger->info("Max. frame size supported: {}", maxFrameSize);
+
+	return true;
+}
+
+
+
+void Rtds2Gpu::dump(spdlog::level::level_enum logLevel)
+{
+	const auto baseaddr = XRtds2gpu_Get_baseaddr(&xInstance);
+	const auto data_offset = XRtds2gpu_Get_data_offset(&xInstance);
+	const auto doorbell_offset = XRtds2gpu_Get_doorbell_offset(&xInstance);
+	const auto frame_size = XRtds2gpu_Get_frame_size(&xInstance);
+
+	logger->log(logLevel, "Rtds2Gpu registers (IP base {:#x}):", xInstance.Ctrl_BaseAddress);
+	logger->log(logLevel, "  Base address (bytes):     {:#x}", baseaddr);
+	logger->log(logLevel, "  Doorbell offset (bytes):  {:#x}", doorbell_offset);
+	logger->log(logLevel, "  Data offset (bytes):      {:#x}", data_offset);
+	logger->log(logLevel, "  Frame size (words):       {:#x}", frame_size);
+	logger->log(logLevel, "  Status:                   {:#x}", status.value);
+	logger->log(logLevel, "    Running:            {}", (status.is_running ? "yes" : "no"));
+	logger->log(logLevel, "    Frame too short:    {}", (status.frame_too_short ? "yes" : "no"));
+	logger->log(logLevel, "    Frame too long:     {}", (status.frame_too_long ? "yes" : "no"));
+	logger->log(logLevel, "    Frame size invalid: {}", (status.invalid_frame_size ? "yes" : "no"));
+	logger->log(logLevel, "    Last count:         {}", status.last_count);
+	logger->log(logLevel, "    Last seq. number:   {}", status.last_seq_nr);
+	logger->log(logLevel, "    Max. frame size:    {}", status.max_frame_size);
+}
+
+bool Rtds2Gpu::startOnce(const MemoryBlock& mem, size_t frameSize, size_t dataOffset, size_t doorbellOffset)
+{
+	auto& mm = MemoryManager::get();
+
+	if(frameSize > maxFrameSize) {
+		logger->error("Requested frame size of {} exceeds max. frame size of {}",
+		              frameSize, maxFrameSize);
+		return false;
+	}
+
+	auto translationFromIp = mm.getTranslation(
+	                               getMasterAddrSpaceByInterface(axiInterface),
+	                               mem.getAddrSpaceId());
+
+	// set address of memory block in HLS IP
+	XRtds2gpu_Set_baseaddr(&xInstance, translationFromIp.getLocalAddr(0));
+
+	XRtds2gpu_Set_doorbell_offset(&xInstance, doorbellOffset);
+	XRtds2gpu_Set_data_offset(&xInstance, dataOffset);
+	XRtds2gpu_Set_frame_size(&xInstance, frameSize);
+
+	// prepare memory with all zeroes
+	auto translationFromProcess = mm.getTranslationFromProcess(mem.getAddrSpaceId());
+	auto memory = reinterpret_cast<void*>(translationFromProcess.getLocalAddr(0));
+	memset(memory, 0, mem.getSize());
+
+	// start IP
+	return start();
+}
+
+
+
+
+
+bool
+Rtds2Gpu::updateStatus()
+{
+	if(not XRtds2gpu_Get_status_vld(&xInstance))
+		return false;
+
+	status.value = XRtds2gpu_Get_status(&xInstance);
+
+	return true;
+}
+
+size_t
+Rtds2Gpu::getMaxFrameSize()
+{
+	XRtds2gpu_Set_frame_size(&xInstance, 0);
+
+	start();
+	while(not isFinished());
+	updateStatus();
+
+	return status.max_frame_size;
+}
+
+void
+Rtds2Gpu::dumpDoorbell(uint32_t doorbellRegister) const
+{
+	auto& doorbell = reinterpret_cast<reg_doorbell_t&>(doorbellRegister);
+
+	logger->info("Doorbell register: {:#08x}", doorbell.value);
+	logger->info("  Valid:       {}", (doorbell.is_valid ? "yes" : "no"));
+	logger->info("  Count:       {}", doorbell.count);
+	logger->info("  Seq. number: {}", doorbell.seq_nr);
+}
+
+Rtds2GpuFactory::Rtds2GpuFactory() :
+    IpNodeFactory(getName())
+{
+}
+
+} // namespace ip
+} // namespace fpga
+} // namespace villas
diff --git a/fpga/lib/ips/rtds2gpu/xrtds2gpu.c b/fpga/lib/ips/rtds2gpu/xrtds2gpu.c
new file mode 100644
index 000000000..26a5e4f34
--- /dev/null
+++ b/fpga/lib/ips/rtds2gpu/xrtds2gpu.c
@@ -0,0 +1,221 @@
+// ==============================================================
+// File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC
+// Version: 2017.3
+// Copyright (C) 1986-2017 Xilinx, Inc. All Rights Reserved.
+// 
+// ==============================================================
+
+/***************************** Include Files *********************************/
+#include <villas/fpga/ips/rtds2gpu/xrtds2gpu.h>
+
+/************************** Function Implementation *************************/
+#ifndef __linux__
+int XRtds2gpu_CfgInitialize(XRtds2gpu *InstancePtr, XRtds2gpu_Config *ConfigPtr) {
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(ConfigPtr != NULL);
+
+    InstancePtr->Ctrl_BaseAddress = ConfigPtr->Ctrl_BaseAddress;
+    InstancePtr->IsReady = XIL_COMPONENT_IS_READY;
+
+    return XST_SUCCESS;
+}
+#endif
+
+void XRtds2gpu_Start(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_AP_CTRL) & 0x80;
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_AP_CTRL, Data | 0x01);
+}
+
+u32 XRtds2gpu_IsDone(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_AP_CTRL);
+    return (Data >> 1) & 0x1;
+}
+
+u32 XRtds2gpu_IsIdle(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_AP_CTRL);
+    return (Data >> 2) & 0x1;
+}
+
+u32 XRtds2gpu_IsReady(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_AP_CTRL);
+    // check ap_start to see if the pcore is ready for next input
+    return !(Data & 0x1);
+}
+
+void XRtds2gpu_EnableAutoRestart(XRtds2gpu *InstancePtr) {
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_AP_CTRL, 0x80);
+}
+
+void XRtds2gpu_DisableAutoRestart(XRtds2gpu *InstancePtr) {
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_AP_CTRL, 0);
+}
+
+void XRtds2gpu_Set_baseaddr(XRtds2gpu *InstancePtr, u32 Data) {
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_BASEADDR_DATA, Data);
+}
+
+u32 XRtds2gpu_Get_baseaddr(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_BASEADDR_DATA);
+    return Data;
+}
+
+void XRtds2gpu_Set_data_offset(XRtds2gpu *InstancePtr, u32 Data) {
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_DATA_OFFSET_DATA, Data);
+}
+
+u32 XRtds2gpu_Get_data_offset(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_DATA_OFFSET_DATA);
+    return Data;
+}
+
+void XRtds2gpu_Set_doorbell_offset(XRtds2gpu *InstancePtr, u32 Data) {
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_DOORBELL_OFFSET_DATA, Data);
+}
+
+u32 XRtds2gpu_Get_doorbell_offset(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_DOORBELL_OFFSET_DATA);
+    return Data;
+}
+
+void XRtds2gpu_Set_frame_size(XRtds2gpu *InstancePtr, u32 Data) {
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_FRAME_SIZE_DATA, Data);
+}
+
+u32 XRtds2gpu_Get_frame_size(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_FRAME_SIZE_DATA);
+    return Data;
+}
+
+u32 XRtds2gpu_Get_status(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_STATUS_DATA);
+    return Data;
+}
+
+u32 XRtds2gpu_Get_status_vld(XRtds2gpu *InstancePtr) {
+    u32 Data;
+
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Data = XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_STATUS_CTRL);
+    return Data & 0x1;
+}
+
+void XRtds2gpu_InterruptGlobalEnable(XRtds2gpu *InstancePtr) {
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_GIE, 1);
+}
+
+void XRtds2gpu_InterruptGlobalDisable(XRtds2gpu *InstancePtr) {
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_GIE, 0);
+}
+
+void XRtds2gpu_InterruptEnable(XRtds2gpu *InstancePtr, u32 Mask) {
+    u32 Register;
+
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Register =  XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_IER);
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_IER, Register | Mask);
+}
+
+void XRtds2gpu_InterruptDisable(XRtds2gpu *InstancePtr, u32 Mask) {
+    u32 Register;
+
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    Register =  XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_IER);
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_IER, Register & (~Mask));
+}
+
+void XRtds2gpu_InterruptClear(XRtds2gpu *InstancePtr, u32 Mask) {
+    Xil_AssertVoid(InstancePtr != NULL);
+    Xil_AssertVoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    XRtds2gpu_WriteReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_ISR, Mask);
+}
+
+u32 XRtds2gpu_InterruptGetEnabled(XRtds2gpu *InstancePtr) {
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    return XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_IER);
+}
+
+u32 XRtds2gpu_InterruptGetStatus(XRtds2gpu *InstancePtr) {
+    Xil_AssertNonvoid(InstancePtr != NULL);
+    Xil_AssertNonvoid(InstancePtr->IsReady == XIL_COMPONENT_IS_READY);
+
+    return XRtds2gpu_ReadReg(InstancePtr->Ctrl_BaseAddress, XRTDS2GPU_CTRL_ADDR_ISR);
+}
+
diff --git a/fpga/lib/ips/switch.cpp b/fpga/lib/ips/switch.cpp
index d2365382c..2d4fa0a15 100644
--- a/fpga/lib/ips/switch.cpp
+++ b/fpga/lib/ips/switch.cpp
@@ -143,7 +143,7 @@ AxiStreamSwitchFactory::configureJson(IpCore& ip, json_t* json_ip)
 
 	auto logger = getLogger();
 
-	auto& axiSwitch = reinterpret_cast<AxiStreamSwitch&>(ip);
+	auto& axiSwitch = dynamic_cast<AxiStreamSwitch&>(ip);
 
 	if(json_unpack(json_ip, "{ s: i }", "num_ports", &axiSwitch.num_ports) != 0) {
 		logger->error("Cannot parse 'num_ports'");
diff --git a/fpga/scripts/hwdef-parse.py b/fpga/scripts/hwdef-parse.py
index f999b5c01..e548ac1d6 100755
--- a/fpga/scripts/hwdef-parse.py
+++ b/fpga/scripts/hwdef-parse.py
@@ -36,7 +36,9 @@ whitelist = [
 	[ 'xilinx.com', 'ip', 'axi_gpio' ],
 	[ 'xilinx.com', 'ip', 'axi_bram_ctrl' ],
 	[ 'xilinx.com', 'ip', 'axis_data_fifo' ],
-	[ 'xilinx.com', 'ip', 'axi_pcie' ]
+	[ 'xilinx.com', 'ip', 'axi_pcie' ],
+	[ 'xilinx.com', 'hls', 'rtds2gpu' ],
+	[ 'xilinx.com', 'hls', 'mem' ]
 ]
 
 # List of VLNI ids of AXI4-Stream infrastructure IP cores which do not alter data
diff --git a/fpga/tests/CMakeLists.txt b/fpga/tests/CMakeLists.txt
index 5db87d33f..1135d90db 100644
--- a/fpga/tests/CMakeLists.txt
+++ b/fpga/tests/CMakeLists.txt
@@ -27,11 +27,13 @@ set(SOURCES
 	dma.cpp
 	fifo.cpp
 	rtds.cpp
+	rtds2gpu.cpp
 	timer.cpp
 )
 
 if(CMAKE_CUDA_COMPILER)
-	list(APPEND SOURCES gpu.cpp)
+	enable_language(CUDA)
+	list(APPEND SOURCES gpu.cpp gpu_kernels.cu)
 endif()
 
 add_executable(unit-tests ${SOURCES})
diff --git a/fpga/tests/dma.cpp b/fpga/tests/dma.cpp
index 63207748f..34a42f72b 100644
--- a/fpga/tests/dma.cpp
+++ b/fpga/tests/dma.cpp
@@ -46,7 +46,7 @@ Test(fpga, dma, .description = "DMA")
 
 		logger->info("Testing {}", *ip);
 
-		auto dma = reinterpret_cast<villas::fpga::ip::Dma&>(*ip);
+		auto dma = dynamic_cast<villas::fpga::ip::Dma&>(*ip);
 
 		if(not dma.loopbackPossible()) {
 			logger->info("Loopback test not possible for {}", *ip);
diff --git a/fpga/tests/fifo.cpp b/fpga/tests/fifo.cpp
index 5ef8e9653..805641352 100644
--- a/fpga/tests/fifo.cpp
+++ b/fpga/tests/fifo.cpp
@@ -46,7 +46,7 @@ Test(fpga, fifo, .description = "FIFO")
 
 		logger->info("Testing {}", *ip);
 
-		auto fifo = reinterpret_cast<villas::fpga::ip::Fifo&>(*ip);
+		auto fifo = dynamic_cast<villas::fpga::ip::Fifo&>(*ip);
 
 		if(not fifo.connectLoopback()) {
 			continue;
diff --git a/fpga/tests/gpu.cpp b/fpga/tests/gpu.cpp
index 6bb3a7dd8..739da2f80 100644
--- a/fpga/tests/gpu.cpp
+++ b/fpga/tests/gpu.cpp
@@ -62,7 +62,7 @@ Test(fpga, gpu_dma, .description = "GPU DMA tests")
 
 		logger->info("Testing {}", *ip);
 
-		auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(ip.get());
+		auto bram = dynamic_cast<villas::fpga::ip::Bram*>(ip.get());
 		cr_assert_not_null(bram, "Couldn't find BRAM");
 
 		count++;
diff --git a/fpga/tests/gpu_kernels.cu b/fpga/tests/gpu_kernels.cu
new file mode 100644
index 000000000..3f5b8b0d4
--- /dev/null
+++ b/fpga/tests/gpu_kernels.cu
@@ -0,0 +1,75 @@
+#include <cstdint>
+#include <cstdio>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <villas/gpu.hpp>
+#include <villas/fpga/ips/rtds2gpu.hpp>
+
+
+
+__global__ void
+gpu_rtds_rtt_kernel(volatile uint32_t* dataIn, volatile reg_doorbell_t* doorbellIn,
+                    volatile uint32_t* dataOut, volatile villas::fpga::ip::ControlRegister* controlRegister,
+                    int* run)
+{
+	printf("[gpu] gpu kernel go\n");
+
+	printf("dataIn:      %p\n", dataIn);
+	printf("doorbellIn:  %p\n", doorbellIn);
+	printf("dataOut:     %p\n", dataOut);
+	printf("control:     %p\n", controlRegister);
+	printf("run:         %p\n", run);
+
+//	*run = reinterpret_cast<bool*>(malloc(sizeof(bool)));
+//	**run = true;
+
+	uint32_t last_seq;
+	while(*run) {
+		// wait for data
+//		printf("[gpu] wait for data, last_seq=%u\n", last_seq);
+		while(not (doorbellIn->is_valid and (last_seq != doorbellIn->seq_nr)) and *run);
+//			printf("doorbell: 0x%08x\n", doorbellIn->value);
+
+		last_seq = doorbellIn->seq_nr;
+
+//		printf("[gpu] copy data\n");
+		for(size_t i = 0; i < doorbellIn->count; i++) {
+			dataOut[i] = dataIn[i];
+		}
+
+		// reset doorbell
+//		printf("[gpu] reset doorbell\n");
+//		doorbellIn->value = 0;
+
+//		printf("[gpu] signal go for gpu2rtds\n");
+		controlRegister->ap_start = 1;
+	}
+
+	printf("kernel done\n");
+}
+
+static int* run = nullptr;
+
+void gpu_rtds_rtt_start(volatile uint32_t* dataIn, volatile reg_doorbell_t* doorbellIn,
+                        volatile uint32_t* dataOut, volatile villas::fpga::ip::ControlRegister* controlRegister)
+{
+	printf("run:         %p\n", run);
+	if(run == nullptr) {
+		run = (int*)malloc(sizeof(uint32_t));
+		cudaHostRegister(run, sizeof(uint32_t), 0);
+	}
+	printf("run:         %p\n", run);
+
+
+	*run = 1;
+	gpu_rtds_rtt_kernel<<<1, 1>>>(dataIn, doorbellIn, dataOut, controlRegister, run);
+	printf("[cpu] kernel launched\n");
+}
+
+void gpu_rtds_rtt_stop()
+{
+	*run = 0;
+	cudaDeviceSynchronize();
+}
diff --git a/fpga/tests/rtds.cpp b/fpga/tests/rtds.cpp
index 0b1067cdd..df88cacdb 100644
--- a/fpga/tests/rtds.cpp
+++ b/fpga/tests/rtds.cpp
@@ -80,8 +80,12 @@ Test(fpga, rtds, .description = "RTDS")
 			auto dmaMaster = dma->getMasterPort(dma->mm2sPort);
 			auto dmaSlave = dma->getSlavePort(dma->s2mmPort);
 
-			rtds->connect(rtdsMaster, dmaSlave);
-			dma->connect(dmaMaster, rtdsSlave);
+//			rtds->connect(*rtds);
+//			logger->info("loopback");
+//			while(1);
+
+//			rtds->connect(rtdsMaster, dmaSlave);
+//			dma->connect(dmaMaster, rtdsSlave);
 
 			auto mem = villas::HostRam::getAllocator().allocate<int32_t>(0x100 / sizeof(int32_t));
 
diff --git a/fpga/tests/rtds2gpu.cpp b/fpga/tests/rtds2gpu.cpp
new file mode 100644
index 000000000..cc3def8ce
--- /dev/null
+++ b/fpga/tests/rtds2gpu.cpp
@@ -0,0 +1,349 @@
+/** FIFO unit test.
+ *
+ * @file
+ * @author Steffen Vogel <stvogel@eonerc.rwth-aachen.de>
+ * @copyright 2017, Steffen Vogel
+ * @license GNU General Public License (version 3)
+ *
+ * VILLASfpga
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *********************************************************************************/
+
+#include <criterion/criterion.h>
+
+#include <iostream>
+
+#include <villas/log.hpp>
+#include <villas/memory.hpp>
+#include <villas/fpga/card.hpp>
+
+#include <villas/fpga/ips/rtds2gpu.hpp>
+#include <villas/fpga/ips/gpu2rtds.hpp>
+#include <villas/fpga/ips/switch.hpp>
+#include <villas/fpga/ips/dma.hpp>
+#include <villas/fpga/ips/rtds.hpp>
+#include <villas/gpu.hpp>
+
+#include "global.hpp"
+
+
+static constexpr size_t SAMPLE_SIZE		= 4;
+static constexpr size_t SAMPLE_COUNT	= 1;
+static constexpr size_t FRAME_SIZE		= SAMPLE_COUNT * SAMPLE_SIZE;
+
+static constexpr size_t DOORBELL_OFFSET = SAMPLE_COUNT;
+static constexpr size_t DATA_OFFSET = 0;
+
+static void dumpMem(const uint32_t* addr, size_t len)
+{
+	const size_t bytesPerLine = 16;
+	const size_t lines = (len) / bytesPerLine + 1;
+	const uint8_t* buf = reinterpret_cast<const uint8_t*>(addr);
+
+	size_t bytesRead = 0;
+
+	for(size_t line = 0; line < lines; line++) {
+		const unsigned base = line * bytesPerLine;
+		printf("0x%04x: ", base);
+
+		for(size_t i = 0; i < bytesPerLine && bytesRead < len; i++) {
+			printf("0x%02x ", buf[base + i]);
+			bytesRead++;
+		}
+		puts("");
+	}
+}
+
+Test(fpga, rtds2gpu_loopback_dma, .description = "Rtds2Gpu")
+{
+	auto logger = loggerGetOrCreate("unittest:rtds2gpu");
+
+	for(auto& ip : state.cards.front()->ips) {
+		if(*ip != villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:hls:rtds2gpu:"))
+			continue;
+
+		logger->info("Testing {}", *ip);
+
+
+		/* Collect neccessary IPs */
+
+		auto rtds2gpu = dynamic_cast<villas::fpga::ip::Rtds2Gpu&>(*ip);
+
+		auto axiSwitch = dynamic_cast<villas::fpga::ip::AxiStreamSwitch*>(
+		                     state.cards.front()->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axis_switch:")));
+
+		auto dma = dynamic_cast<villas::fpga::ip::Dma*>(
+		                     state.cards.front()->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_dma:")));
+
+		auto gpu2rtds = dynamic_cast<villas::fpga::ip::Gpu2Rtds*>(
+		                     state.cards.front()->lookupIp(villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:hls:gpu2rtds:")));
+
+		auto rtds = dynamic_cast<villas::fpga::ip::Rtds*>(
+		                     state.cards.front()->lookupIp(villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:user:rtds_axis:")));
+
+
+		cr_assert_not_null(axiSwitch, "No AXI switch IP found");
+		cr_assert_not_null(dma, "No DMA IP found");
+		cr_assert_not_null(gpu2rtds, "No Gpu2Rtds IP found");
+		cr_assert_not_null(rtds, "RTDS IP not found");
+
+		rtds2gpu.dump(spdlog::level::debug);
+		gpu2rtds->dump(spdlog::level::debug);
+
+
+		/* Allocate and prepare memory */
+
+		// allocate space for all samples and doorbell register
+		auto dmaMemSrc = villas::HostDmaRam::getAllocator(0).allocate<uint32_t>(SAMPLE_COUNT + 1);
+		auto dmaMemDst = villas::HostDmaRam::getAllocator(0).allocate<uint32_t>(SAMPLE_COUNT + 1);
+		auto dmaMemDst2 = villas::HostDmaRam::getAllocator(0).allocate<uint32_t>(SAMPLE_COUNT + 1);
+
+
+		memset(&dmaMemSrc, 0x11, dmaMemSrc.getMemoryBlock().getSize());
+		memset(&dmaMemDst, 0x55, dmaMemDst.getMemoryBlock().getSize());
+		memset(&dmaMemDst2, 0x77, dmaMemDst2.getMemoryBlock().getSize());
+
+		const uint32_t* dataSrc = &dmaMemSrc[DATA_OFFSET];
+		const uint32_t* dataDst = &dmaMemDst[DATA_OFFSET];
+		const uint32_t* dataDst2 = &dmaMemDst2[0];
+
+		dumpMem(dataSrc, dmaMemSrc.getMemoryBlock().getSize());
+		dumpMem(dataDst, dmaMemDst.getMemoryBlock().getSize());
+		dumpMem(dataDst2, dmaMemDst2.getMemoryBlock().getSize());
+
+
+		// connect AXI Stream from DMA to Rtds2Gpu IP
+		cr_assert(dma->connect(rtds2gpu));
+
+		cr_assert(rtds2gpu.startOnce(dmaMemDst.getMemoryBlock(), SAMPLE_COUNT, DATA_OFFSET * 4, DOORBELL_OFFSET * 4),
+		          "Preparing Rtds2Gpu IP failed");
+
+		cr_assert(dma->write(dmaMemSrc.getMemoryBlock(), FRAME_SIZE),
+		          "Starting DMA MM2S transfer failed");
+
+		cr_assert(dma->writeComplete(),
+		          "DMA failed");
+
+		while(not rtds2gpu.isFinished());
+
+		const uint32_t* doorbellDst = &dmaMemDst[DOORBELL_OFFSET];
+		rtds2gpu.dump(spdlog::level::info);
+		rtds2gpu.dumpDoorbell(*doorbellDst);
+
+		cr_assert(memcmp(dataSrc, dataDst, FRAME_SIZE) == 0, "Memory not equal");
+
+
+		for(size_t i = 0; i < SAMPLE_COUNT; i++) {
+			gpu2rtds->registerFrames[i] = dmaMemDst[i];
+		}
+
+
+		// connect AXI Stream from Gpu2Rtds IP to DMA
+		cr_assert(gpu2rtds->connect(*dma));
+
+		cr_assert(dma->read(dmaMemDst2.getMemoryBlock(), FRAME_SIZE),
+		          "Starting DMA S2MM transfer failed");
+
+		cr_assert(gpu2rtds->startOnce(SAMPLE_COUNT),
+		          "Preparing Gpu2Rtds IP failed");
+
+		cr_assert(dma->readComplete(),
+		          "DMA failed");
+
+		while(not gpu2rtds->isFinished());
+
+		cr_assert(memcmp(dataSrc, dataDst2, FRAME_SIZE) == 0, "Memory not equal");
+
+		dumpMem(dataSrc, dmaMemSrc.getMemoryBlock().getSize());
+		dumpMem(dataDst, dmaMemDst.getMemoryBlock().getSize());
+		dumpMem(dataDst2, dmaMemDst2.getMemoryBlock().getSize());
+
+		logger->info(TXT_GREEN("Passed"));
+	}
+}
+
+Test(fpga, rtds2gpu_rtt_cpu, .description = "Rtds2Gpu RTT via CPU")
+{
+	auto logger = loggerGetOrCreate("unittest:rtds2gpu");
+
+	/* Collect neccessary IPs */
+
+	auto gpu2rtds = dynamic_cast<villas::fpga::ip::Gpu2Rtds*>(
+	                     state.cards.front()->lookupIp(villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:hls:gpu2rtds:")));
+
+	auto rtds2gpu = dynamic_cast<villas::fpga::ip::Rtds2Gpu*>(
+	                     state.cards.front()->lookupIp(villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:hls:rtds2gpu:")));
+
+	cr_assert_not_null(gpu2rtds, "No Gpu2Rtds IP found");
+	cr_assert_not_null(rtds2gpu, "No Rtds2Gpu IP not found");
+
+	for(auto& ip : state.cards.front()->ips) {
+		if(*ip != villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:user:rtds_axis:"))
+			continue;
+
+		auto& rtds = dynamic_cast<villas::fpga::ip::Rtds&>(*ip);
+		logger->info("Testing {}", rtds);
+
+		auto dmaRam = villas::HostDmaRam::getAllocator().allocate<uint32_t>(SAMPLE_COUNT + 1);
+		uint32_t* data = &dmaRam[DATA_OFFSET];
+		uint32_t* doorbell = &dmaRam[DOORBELL_OFFSET];
+
+		// TEST: rtds loopback via switch, this should always work and have RTT=1
+		//cr_assert(rtds.connect(rtds));
+		//logger->info("loopback");
+		//while(1);
+
+		cr_assert(rtds.connect(*rtds2gpu));
+		cr_assert(gpu2rtds->connect(rtds));
+
+
+		for(size_t i = 1; i <= 10000; ) {
+			rtds2gpu->doorbellReset(*doorbell);
+			rtds2gpu->startOnce(dmaRam.getMemoryBlock(), SAMPLE_COUNT, DATA_OFFSET * 4, DOORBELL_OFFSET * 4);
+
+			// Wait by polling rtds2gpu IP or ...
+			// while(not rtds2gpu->isFinished());
+
+			// Wait by polling (local) doorbell register (= just memory)
+			while(not rtds2gpu->doorbellIsValid(*doorbell));
+
+
+			// copy samples to gpu2rtds IP
+			for(size_t i = 0; i < SAMPLE_COUNT; i++) {
+				gpu2rtds->registerFrames[i] = data[i];
+			}
+
+			// Waiting for gpu2rtds is not strictly required
+			gpu2rtds->startOnce(SAMPLE_COUNT);
+			//while(not gpu2rtds->isFinished());
+
+			if(i % 1000 == 0) {
+				logger->info("Successful iterations {}, data {}", i, data[0]);
+				rtds2gpu->dump();
+				rtds2gpu->dumpDoorbell(data[1]);
+			}
+		}
+
+		logger->info(TXT_GREEN("Passed"));
+	}
+}
+
+void gpu_rtds_rtt_start(volatile uint32_t* dataIn, volatile reg_doorbell_t* doorbellIn,
+                        volatile uint32_t* dataOut, volatile villas::fpga::ip::ControlRegister* controlRegister);
+
+void gpu_rtds_rtt_stop();
+
+Test(fpga, rtds2gpu_rtt_gpu, .description = "Rtds2Gpu RTT via GPU")
+{
+	auto logger = loggerGetOrCreate("unittest:rtds2gpu");
+
+	/* Collect neccessary IPs */
+
+	auto gpu2rtds = dynamic_cast<villas::fpga::ip::Gpu2Rtds*>(
+	                     state.cards.front()->lookupIp(villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:hls:gpu2rtds:")));
+
+	auto rtds2gpu = dynamic_cast<villas::fpga::ip::Rtds2Gpu*>(
+	                     state.cards.front()->lookupIp(villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:hls:rtds2gpu:")));
+
+	cr_assert_not_null(gpu2rtds, "No Gpu2Rtds IP found");
+	cr_assert_not_null(rtds2gpu, "No Rtds2Gpu IP not found");
+
+	villas::Plugin* plugin = villas::Plugin::lookup(villas::Plugin::Type::Gpu, "");
+	auto gpuPlugin = dynamic_cast<villas::gpu::GpuFactory*>(plugin);
+	cr_assert_not_null(gpuPlugin, "No GPU plugin found");
+
+	auto gpus = gpuPlugin->make();
+	cr_assert(gpus.size() > 0, "No GPUs found");
+
+	// just get first cpu
+	auto& gpu = gpus.front();
+
+	// allocate memory on GPU and make accessible by to PCIe/FPGA
+	auto gpuRam = gpu->getAllocator().allocate<uint32_t>(SAMPLE_COUNT + 1);
+	cr_assert(gpu->makeAccessibleToPCIeAndVA(gpuRam.getMemoryBlock()));
+
+	// make Gpu2Rtds IP register memory on FPGA accessible to GPU
+	cr_assert(gpu->makeAccessibleFromPCIeOrHostRam(gpu2rtds->getRegisterMemory()));
+
+	auto tr = gpu->translate(gpuRam.getMemoryBlock());
+
+	auto dataIn		= reinterpret_cast<uint32_t*>(tr.getLocalAddr(DATA_OFFSET * sizeof(uint32_t)));
+	auto doorbellIn	= reinterpret_cast<reg_doorbell_t*>(tr.getLocalAddr(DOORBELL_OFFSET * sizeof(uint32_t)));
+
+
+	auto gpu2rtdsRegisters = gpu->translate(gpu2rtds->getRegisterMemory());
+
+	auto frameRegister		= reinterpret_cast<uint32_t*>(gpu2rtdsRegisters.getLocalAddr(gpu2rtds->registerFrameOffset));
+	auto controlRegister	= reinterpret_cast<villas::fpga::ip::ControlRegister*>(gpu2rtdsRegisters.getLocalAddr(gpu2rtds->registerControlAddr));
+
+//	auto doorbellInCpu = reinterpret_cast<reg_doorbell_t*>(&gpuRam[DOORBELL_OFFSET]);
+
+	for(auto& ip : state.cards.front()->ips) {
+		if(*ip != villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:user:rtds_axis:"))
+			continue;
+
+		auto& rtds = dynamic_cast<villas::fpga::ip::Rtds&>(*ip);
+		logger->info("Testing {}", rtds);
+
+
+		// TEST: rtds loopback via switch, this should always work and have RTT=1
+		//cr_assert(rtds.connect(rtds));
+		//logger->info("loopback");
+		//while(1);
+
+		cr_assert(rtds.connect(*rtds2gpu));
+		cr_assert(gpu2rtds->connect(rtds));
+
+		// launch once so they are configured
+		cr_assert(rtds2gpu->startOnce(gpuRam.getMemoryBlock(), SAMPLE_COUNT, DATA_OFFSET * 4, DOORBELL_OFFSET * 4));
+		cr_assert(gpu2rtds->startOnce(SAMPLE_COUNT));
+
+		rtds2gpu->setAutoRestart(true);
+		rtds2gpu->start();
+
+		logger->info("GPU RTT RTDS");
+
+		std::string dummy;
+
+//		logger->info("Press enter to proceed");
+//		std::cin >> dummy;
+
+		gpu_rtds_rtt_start(dataIn, doorbellIn, frameRegister, controlRegister);
+
+//		while(1) {
+//			cr_assert(rtds2gpu->startOnce(gpuRam.getMemoryBlock(), SAMPLE_COUNT, DATA_OFFSET * 4, DOORBELL_OFFSET * 4));
+//		}
+
+//		for(int i = 0; i < 10000; i++) {
+//			while(not doorbellInCpu->is_valid);
+//			logger->debug("received data");
+//		}
+
+//		logger->info("Press enter to cancel");
+//		std::cin >> dummy;
+
+		while(1) {
+			sleep(1);
+//			logger->debug("Current sequence number: {}", doorbellInCpu->seq_nr);
+			logger->debug("Still running");
+		}
+
+		gpu_rtds_rtt_stop();
+
+
+
+		logger->info(TXT_GREEN("Passed"));
+	}
+}
diff --git a/fpga/tests/timer.cpp b/fpga/tests/timer.cpp
index d5f96a446..dd7ccff02 100644
--- a/fpga/tests/timer.cpp
+++ b/fpga/tests/timer.cpp
@@ -45,7 +45,7 @@ Test(fpga, timer, .description = "Timer Counter")
 
 		count++;
 
-		auto timer = reinterpret_cast<villas::fpga::ip::Timer&>(*ip);
+		auto timer = dynamic_cast<villas::fpga::ip::Timer&>(*ip);
 
 		logger->info("Test simple waiting");
 		timer.start(timer.getFrequency() / 10);