From d853d5e0d3accacf99dfab768501cde9d0932c78 Mon Sep 17 00:00:00 2001
From: Daniel Krebs <github@daniel-krebs.net>
Date: Wed, 6 Jun 2018 09:55:14 +0200
Subject: [PATCH] wip GPU RTT

---
 fpga/include/villas/fpga/ip_node.hpp      |   4 +-
 fpga/include/villas/fpga/ips/gpu2rtds.hpp |   6 +-
 fpga/include/villas/fpga/ips/hls.hpp      |   2 +-
 fpga/include/villas/fpga/ips/rtds2gpu.hpp |  13 +++
 fpga/lib/gpu/gpu.cpp                      |   7 ++
 fpga/lib/gpu/include/villas/gpu.hpp       |   3 +
 fpga/lib/ips/rtds2gpu/rtds2gpu.cpp        |   5 +-
 fpga/lib/kernel/vfio.cpp                  |   1 +
 fpga/tests/CMakeLists.txt                 |   3 +-
 fpga/tests/rtds2gpu.cpp                   | 113 +++++++++++++++++++++-
 10 files changed, 148 insertions(+), 9 deletions(-)

diff --git a/fpga/include/villas/fpga/ip_node.hpp b/fpga/include/villas/fpga/ip_node.hpp
index 1257529c2..a89e9842f 100644
--- a/fpga/include/villas/fpga/ip_node.hpp
+++ b/fpga/include/villas/fpga/ip_node.hpp
@@ -71,8 +71,8 @@ public:
 	                        const std::string& port,
 	                        bool isMaster)
 	{
-		for(auto& [vertexId, vertex] : vertices) {
-			(void) vertexId;
+		for(auto& vertexEntry : vertices) {
+			auto& vertex = vertexEntry.second;
 			if(vertex->nodeName == node and vertex->portName == port and vertex->isMaster == isMaster)
 				return vertex;
 		}
diff --git a/fpga/include/villas/fpga/ips/gpu2rtds.hpp b/fpga/include/villas/fpga/ips/gpu2rtds.hpp
index b19702bef..1aab1a2aa 100644
--- a/fpga/include/villas/fpga/ips/gpu2rtds.hpp
+++ b/fpga/include/villas/fpga/ips/gpu2rtds.hpp
@@ -28,10 +28,14 @@ public:
 	getDefaultMasterPort() const
 	{ return getMasterPort(rtdsOutputStreamPort); }
 
+	MemoryBlock
+	getRegisterMemory() const
+	{ return MemoryBlock(0, 1 << 10, getAddressSpaceId(registerMemory)); }
+
 private:
 	bool updateStatus();
 
-private:
+public:
 	static constexpr const char* rtdsOutputStreamPort = "rtds_output";
 
 	struct StatusControlRegister { uint32_t
diff --git a/fpga/include/villas/fpga/ips/hls.hpp b/fpga/include/villas/fpga/ips/hls.hpp
index 685af9050..1184fdfd9 100644
--- a/fpga/include/villas/fpga/ips/hls.hpp
+++ b/fpga/include/villas/fpga/ips/hls.hpp
@@ -92,7 +92,7 @@ protected:
 	{ return { registerMemory }; }
 
 
-private:
+public:
 	/* Register definitions */
 
 	static constexpr uintptr_t registerControlAddr			= 0x00;
diff --git a/fpga/include/villas/fpga/ips/rtds2gpu.hpp b/fpga/include/villas/fpga/ips/rtds2gpu.hpp
index 0b35848cf..b956805c3 100644
--- a/fpga/include/villas/fpga/ips/rtds2gpu.hpp
+++ b/fpga/include/villas/fpga/ips/rtds2gpu.hpp
@@ -11,6 +11,19 @@ namespace villas {
 namespace fpga {
 namespace ip {
 
+union ControlRegister {
+	uint32_t value;
+	struct  { uint32_t
+		ap_start				: 1,
+		ap_done					: 1,
+		ap_idle					: 1,
+		ap_ready				: 1,
+		_res1					: 3,
+		auto_restart			: 1,
+		_res2					: 24;
+	};
+};
+
 
 class Rtds2Gpu : public IpNode, public Hls
 {
diff --git a/fpga/lib/gpu/gpu.cpp b/fpga/lib/gpu/gpu.cpp
index e8f7d58ec..ffb7b7ad3 100644
--- a/fpga/lib/gpu/gpu.cpp
+++ b/fpga/lib/gpu/gpu.cpp
@@ -351,6 +351,13 @@ void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t si
 	cudaDeviceSynchronize();
 }
 
+MemoryTranslation
+Gpu::translate(const MemoryBlock& dst)
+{
+	auto& mm = MemoryManager::get();
+	return mm.getTranslation(masterPciEAddrSpaceId, dst.getAddrSpaceId());
+}
+
 
 std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
 GpuAllocator::allocateBlock(size_t size)
diff --git a/fpga/lib/gpu/include/villas/gpu.hpp b/fpga/lib/gpu/include/villas/gpu.hpp
index 88b316815..00f1464ca 100644
--- a/fpga/lib/gpu/include/villas/gpu.hpp
+++ b/fpga/lib/gpu/include/villas/gpu.hpp
@@ -36,6 +36,9 @@ public:
 
 	void memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
 
+	MemoryTranslation
+	translate(const MemoryBlock& dst);
+
 private:
 	bool registerIoMemory(const MemoryBlock& mem);
 	bool registerHostMemory(const MemoryBlock& mem);
diff --git a/fpga/lib/ips/rtds2gpu/rtds2gpu.cpp b/fpga/lib/ips/rtds2gpu/rtds2gpu.cpp
index 1fa271764..e89574efa 100644
--- a/fpga/lib/ips/rtds2gpu/rtds2gpu.cpp
+++ b/fpga/lib/ips/rtds2gpu/rtds2gpu.cpp
@@ -22,11 +22,10 @@ bool Rtds2Gpu::init()
 	status.value = 0;
 	started = false;
 
-	maxFrameSize = getMaxFrameSize();
+//	maxFrameSize = getMaxFrameSize();
+	maxFrameSize = 16;
 	logger->info("Max. frame size supported: {}", maxFrameSize);
 
-//	maxFrameSize = 16;
-
 	return true;
 }
 
diff --git a/fpga/lib/kernel/vfio.cpp b/fpga/lib/kernel/vfio.cpp
index b9639c1de..b04d172ce 100644
--- a/fpga/lib/kernel/vfio.cpp
+++ b/fpga/lib/kernel/vfio.cpp
@@ -754,6 +754,7 @@ VfioGroup::attach(VfioContainer& container, int groupIndex)
 	          << (container.isIommuEnabled() ? "" : "noiommu-")
 	          << groupIndex;
 
+	logger->debug("path: {}", groupPath.str().c_str());
 	group->fd = open(groupPath.str().c_str(), O_RDWR);
 	if (group->fd < 0) {
 		logger->error("Failed to open VFIO group {}", group->index);
diff --git a/fpga/tests/CMakeLists.txt b/fpga/tests/CMakeLists.txt
index 6eb34491a..36aacf045 100644
--- a/fpga/tests/CMakeLists.txt
+++ b/fpga/tests/CMakeLists.txt
@@ -11,7 +11,8 @@ set(SOURCES
 )
 
 if(CMAKE_CUDA_COMPILER)
-	list(APPEND SOURCES gpu.cpp)
+	enable_language(CUDA)
+	list(APPEND SOURCES gpu.cpp gpu_kernels.cu)
 endif()
 
 add_executable(unit-tests ${SOURCES})
diff --git a/fpga/tests/rtds2gpu.cpp b/fpga/tests/rtds2gpu.cpp
index a453572d9..cc3def8ce 100644
--- a/fpga/tests/rtds2gpu.cpp
+++ b/fpga/tests/rtds2gpu.cpp
@@ -23,6 +23,8 @@
 
 #include <criterion/criterion.h>
 
+#include <iostream>
+
 #include <villas/log.hpp>
 #include <villas/memory.hpp>
 #include <villas/fpga/card.hpp>
@@ -32,6 +34,7 @@
 #include <villas/fpga/ips/switch.hpp>
 #include <villas/fpga/ips/dma.hpp>
 #include <villas/fpga/ips/rtds.hpp>
+#include <villas/gpu.hpp>
 
 #include "global.hpp"
 
@@ -206,7 +209,7 @@ Test(fpga, rtds2gpu_rtt_cpu, .description = "Rtds2Gpu RTT via CPU")
 		cr_assert(gpu2rtds->connect(rtds));
 
 
-		for(size_t i = 1; i <= 10000; i++) {
+		for(size_t i = 1; i <= 10000; ) {
 			rtds2gpu->doorbellReset(*doorbell);
 			rtds2gpu->startOnce(dmaRam.getMemoryBlock(), SAMPLE_COUNT, DATA_OFFSET * 4, DOORBELL_OFFSET * 4);
 
@@ -233,6 +236,114 @@ Test(fpga, rtds2gpu_rtt_cpu, .description = "Rtds2Gpu RTT via CPU")
 			}
 		}
 
+		logger->info(TXT_GREEN("Passed"));
+	}
+}
+
+void gpu_rtds_rtt_start(volatile uint32_t* dataIn, volatile reg_doorbell_t* doorbellIn,
+                        volatile uint32_t* dataOut, volatile villas::fpga::ip::ControlRegister* controlRegister);
+
+void gpu_rtds_rtt_stop();
+
+Test(fpga, rtds2gpu_rtt_gpu, .description = "Rtds2Gpu RTT via GPU")
+{
+	auto logger = loggerGetOrCreate("unittest:rtds2gpu");
+
+	/* Collect neccessary IPs */
+
+	auto gpu2rtds = dynamic_cast<villas::fpga::ip::Gpu2Rtds*>(
+	                     state.cards.front()->lookupIp(villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:hls:gpu2rtds:")));
+
+	auto rtds2gpu = dynamic_cast<villas::fpga::ip::Rtds2Gpu*>(
+	                     state.cards.front()->lookupIp(villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:hls:rtds2gpu:")));
+
+	cr_assert_not_null(gpu2rtds, "No Gpu2Rtds IP found");
+	cr_assert_not_null(rtds2gpu, "No Rtds2Gpu IP not found");
+
+	villas::Plugin* plugin = villas::Plugin::lookup(villas::Plugin::Type::Gpu, "");
+	auto gpuPlugin = dynamic_cast<villas::gpu::GpuFactory*>(plugin);
+	cr_assert_not_null(gpuPlugin, "No GPU plugin found");
+
+	auto gpus = gpuPlugin->make();
+	cr_assert(gpus.size() > 0, "No GPUs found");
+
+	// just get first cpu
+	auto& gpu = gpus.front();
+
+	// allocate memory on GPU and make accessible by to PCIe/FPGA
+	auto gpuRam = gpu->getAllocator().allocate<uint32_t>(SAMPLE_COUNT + 1);
+	cr_assert(gpu->makeAccessibleToPCIeAndVA(gpuRam.getMemoryBlock()));
+
+	// make Gpu2Rtds IP register memory on FPGA accessible to GPU
+	cr_assert(gpu->makeAccessibleFromPCIeOrHostRam(gpu2rtds->getRegisterMemory()));
+
+	auto tr = gpu->translate(gpuRam.getMemoryBlock());
+
+	auto dataIn		= reinterpret_cast<uint32_t*>(tr.getLocalAddr(DATA_OFFSET * sizeof(uint32_t)));
+	auto doorbellIn	= reinterpret_cast<reg_doorbell_t*>(tr.getLocalAddr(DOORBELL_OFFSET * sizeof(uint32_t)));
+
+
+	auto gpu2rtdsRegisters = gpu->translate(gpu2rtds->getRegisterMemory());
+
+	auto frameRegister		= reinterpret_cast<uint32_t*>(gpu2rtdsRegisters.getLocalAddr(gpu2rtds->registerFrameOffset));
+	auto controlRegister	= reinterpret_cast<villas::fpga::ip::ControlRegister*>(gpu2rtdsRegisters.getLocalAddr(gpu2rtds->registerControlAddr));
+
+//	auto doorbellInCpu = reinterpret_cast<reg_doorbell_t*>(&gpuRam[DOORBELL_OFFSET]);
+
+	for(auto& ip : state.cards.front()->ips) {
+		if(*ip != villas::fpga::Vlnv("acs.eonerc.rwth-aachen.de:user:rtds_axis:"))
+			continue;
+
+		auto& rtds = dynamic_cast<villas::fpga::ip::Rtds&>(*ip);
+		logger->info("Testing {}", rtds);
+
+
+		// TEST: rtds loopback via switch, this should always work and have RTT=1
+		//cr_assert(rtds.connect(rtds));
+		//logger->info("loopback");
+		//while(1);
+
+		cr_assert(rtds.connect(*rtds2gpu));
+		cr_assert(gpu2rtds->connect(rtds));
+
+		// launch once so they are configured
+		cr_assert(rtds2gpu->startOnce(gpuRam.getMemoryBlock(), SAMPLE_COUNT, DATA_OFFSET * 4, DOORBELL_OFFSET * 4));
+		cr_assert(gpu2rtds->startOnce(SAMPLE_COUNT));
+
+		rtds2gpu->setAutoRestart(true);
+		rtds2gpu->start();
+
+		logger->info("GPU RTT RTDS");
+
+		std::string dummy;
+
+//		logger->info("Press enter to proceed");
+//		std::cin >> dummy;
+
+		gpu_rtds_rtt_start(dataIn, doorbellIn, frameRegister, controlRegister);
+
+//		while(1) {
+//			cr_assert(rtds2gpu->startOnce(gpuRam.getMemoryBlock(), SAMPLE_COUNT, DATA_OFFSET * 4, DOORBELL_OFFSET * 4));
+//		}
+
+//		for(int i = 0; i < 10000; i++) {
+//			while(not doorbellInCpu->is_valid);
+//			logger->debug("received data");
+//		}
+
+//		logger->info("Press enter to cancel");
+//		std::cin >> dummy;
+
+		while(1) {
+			sleep(1);
+//			logger->debug("Current sequence number: {}", doorbellInCpu->seq_nr);
+			logger->debug("Still running");
+		}
+
+		gpu_rtds_rtt_stop();
+
+
+
 		logger->info(TXT_GREEN("Passed"));
 	}
 }