From 498af9fd1c5b23e084382adfa51dd95677527a14 Mon Sep 17 00:00:00 2001
From: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
Date: Fri, 6 Jan 2023 17:21:47 +0100
Subject: [PATCH] ips/dma: make read correctly wait on interrupts

Modify villas-fpga-ctrl to fit the new behavior of Dma.
This makes reading from DMA work even when we are too slow and
receive only partial batches of buffer descriptors (BDs).

Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
---
 fpga/include/villas/fpga/ips/dma.hpp |  3 +-
 fpga/lib/ips/dma.cpp                 | 42 ++++++++--------------------
 fpga/src/villas-fpga-ctrl.cpp        | 41 ++++++++++++++-------------
 3 files changed, 35 insertions(+), 51 deletions(-)
diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp
index 5f4f21a0f..ed29786f2 100644
--- a/fpga/include/villas/fpga/ips/dma.hpp
+++ b/fpga/include/villas/fpga/ips/dma.hpp
@@ -133,7 +133,8 @@ private:
 	// When using SG: ringBdSize is the maximum number of BDs usable in the ring
 	// Depending on alignment, the actual number of BDs usable can be smaller
 	static constexpr size_t requestedRingBdSize = 2048;
-	uint32_t actualRingBdSize = XAxiDma_BdRingCntCalc(XAXIDMA_BD_MINIMUM_ALIGNMENT, requestedRingBdSize);
+	static constexpr size_t requestedRingBdSizeMemory = requestedRingBdSize * sizeof(XAxiDma_Bd);
+	uint32_t actualRingBdSize = XAxiDma_BdRingCntCalc(XAXIDMA_BD_MINIMUM_ALIGNMENT, requestedRingBdSizeMemory);
 	std::shared_ptr<MemoryBlock> sgRingTx;
 	std::shared_ptr<MemoryBlock> sgRingRx;
 };
diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp
index c129b6f82..92aae09ae 100644
--- a/fpga/lib/ips/dma.cpp
+++ b/fpga/lib/ips/dma.cpp
@@ -82,7 +82,7 @@ void Dma::setupScatterGatherRingRx()
 
 	// Allocate and map space for BD ring in host RAM
 	auto &alloc = villas::HostRam::getAllocator();
-	sgRingRx = alloc.allocateBlock(requestedRingBdSize * sizeof(uint16_t) * XAXIDMA_BD_NUM_WORDS);
+	sgRingRx = alloc.allocateBlock(requestedRingBdSizeMemory);
 
 	if (not card->mapMemoryBlock(sgRingRx))
 		throw RuntimeError("Memory not accessible by DMA");
@@ -127,7 +127,7 @@ void Dma::setupScatterGatherRingTx()
 
 	// Allocate and map space for BD ring in host RAM
 	auto &alloc = villas::HostRam::getAllocator();
-	sgRingTx = alloc.allocateBlock(requestedRingBdSize * sizeof(uint16_t) * XAXIDMA_BD_NUM_WORDS);
+	sgRingTx = alloc.allocateBlock(requestedRingBdSizeMemory);
 
 	if (not card->mapMemoryBlock(sgRingTx))
 		throw RuntimeError("Memory not accessible by DMA");
@@ -256,8 +256,6 @@ bool Dma::read(const MemoryBlock &mem, size_t len)
 	if (buf == nullptr)
 		throw RuntimeError("Buffer was null");
 
-	logger->debug("Read from stream and write to address {:p}", buf);
-
 	return hasScatterGather() ? readScatterGather(buf, len) : readSimple(buf, len);
 }
 
@@ -396,43 +394,27 @@ Dma::readCompleteScatterGather()
 	auto rxRing = XAxiDma_GetRxRing(&xDma);
 	int ret = XST_FAILURE;
 	size_t bytesRead = 0;
+	static size_t errcnt = 32;
+
+	//auto intrNum =
+	irqs[s2mmInterrupt].irqController->waitForInterrupt(irqs[s2mmInterrupt].num);
 
 	// Wait until the data has been received by the RX channel.
 	if ((processedBds = XAxiDma_BdRingFromHw(rxRing, readCoalesce, &bd)) < readCoalesce)
 	{
-		if (processedBds != 0) {
-			//Ignore partial batches
-			logger->warn("Ignoring partial batch of {} BDs.", processedBds);
-			ret = XAxiDma_BdRingFree(rxRing, processedBds, bd);
-			if (ret != XST_SUCCESS)
-				throw RuntimeError("Failed to free {} RX BDs {}", processedBds, ret);
+		logger->warn("Got partial batch of {}/{} BDs.", processedBds, readCoalesce);
+		if(errcnt-- == 0) {
+			throw RuntimeError("too many partial batches");
 		}
-		//auto intrNum =
-		irqs[s2mmInterrupt].irqController->waitForInterrupt(irqs[s2mmInterrupt].num);
-		//If we got a partial batch on the first call, we have to receive up to readCoalesce*2
-		//to make sure we get a full batch of readCoalesce messages
-		processedBds = XAxiDma_BdRingFromHw(rxRing, readCoalesce*2, &bd);
 	}
-	if(processedBds < readCoalesce) {
-		// We got less than we expected. We already tried two times so let's give up.
-		throw RuntimeError("Read only {} BDs, expected {}.", processedBds, readCoalesce);
-	} else if(processedBds > readCoalesce) {
-		// If the first try was a partial batch, we receive two batches on the second try
-		// We ignore the first batch and only process the second one
-		while (processedBds > readCoalesce) {
-			bd = (XAxiDma_Bd *) XAxiDma_BdRingNext(rxRing, bd);
-			processedBds--;
-		}
-		ret = XAxiDma_BdRingFree(rxRing, processedBds-readCoalesce, bd);
-		if (ret != XST_SUCCESS)
-			throw RuntimeError("Failed to free {} RX BDs {}", processedBds, ret);
-	}
-	// At this point we have exactly readCoalesce BDs.
 
 	// Acknowledge the interrupt. Has no effect if no interrupt has occured.
 	auto irqStatus = XAxiDma_BdRingGetIrq(rxRing);
 	XAxiDma_BdRingAckIrq(rxRing, irqStatus);
 
+	if (processedBds == 0)
+		return 0;
+
 	if (bd == nullptr)
 		throw RuntimeError("Bd was null.");
 
diff --git a/fpga/src/villas-fpga-ctrl.cpp b/fpga/src/villas-fpga-ctrl.cpp
index a15112bb1..b6de02200 100644
--- a/fpga/src/villas-fpga-ctrl.cpp
+++ b/fpga/src/villas-fpga-ctrl.cpp
@@ -53,37 +53,38 @@ void readFromDmaToStdOut(std::shared_ptr<villas::fpga::ip::Dma> dma)
 	auto &mm = MemoryManager::get();
 	mm.getGraph().dump("graph.dot");
 
+
+	size_t cur = 0, next = 1;
+	std::ios::sync_with_stdio(false);
+	size_t samplecnt = 0;
+	static const char outputfmt[] = "%05zd: %7f\n";
+	static const size_t outputfmtSize = 16;
+	char outputbuf[16][outputfmtSize] = {0};
+	size_t bytesRead;
+
 	// Setup read transfer
 	dma->read(*block[0], block[0]->getSize());
-	size_t cur = 0, next = 1;
+
 	while (true) {
+		//logger->debug("Read from stream and write to address {:p}", *block[next]);
 		dma->read(*block[next], block[next]->getSize());
-		auto bytesRead = dma->readComplete();
-		// Setup read transfer
-
-		//auto valuesRead = bytesRead / sizeof(int32_t);
-		//logger->info("Read {} bytes", bytesRead);
-
-		//for (size_t i = 0; i < valuesRead; i++)
-		//	std::cerr << std::hex << mem[i] << ";";
-		//std::cerr << std::endl;
+		bytesRead = dma->readComplete();
 
 		for (size_t i = 0; i*4 < bytesRead; i++) {
 			int32_t ival = mem[cur][i];
 			float fval = *((float*)(&ival)); // cppcheck-suppress invalidPointerCast
 			//std::cerr << std::hex << ival << ",";
-			std::cerr << fval << std::endl;
-			/*int64_t ival = (int64_t)(mem[1] & 0xFFFF) << 48 |
-				(int64_t)(mem[1] & 0xFFFF0000) << 16 |
-				(int64_t)(mem[0] & 0xFFFF) << 16 |
-				(int64_t)(mem[0] & 0xFFFF0000) >> 16;
-			double dval = *((double*)(&ival));
-			std::cerr << std::hex << ival << "," << dval << std::endl;
-			bytesRead -= 8;*/
-			//logger->info("Read value: {}", dval);
+			//std::cout << samplecnt++ << ": " << fval << '\n';
+			if (std::snprintf(outputbuf[i], outputfmtSize+1, outputfmt, (samplecnt++%100000), fval) > (int)outputfmtSize) {
+				throw RuntimeError("Output buffer too small");
+			}
 		}
+		for (size_t i = 0; i < sizeof(outputbuf)/sizeof(outputbuf[0])-bytesRead/4; i++) {
+			outputbuf[i][0] = '\0';
+		}
+		std::cout << *outputbuf << std::flush;
 		cur = next;
-		next = (next + 1) % (sizeof(mem)/sizeof(mem[0]));
+		next = (next + 1) % (sizeof(mem) / sizeof(mem[0]));
 	}
 }