From 57d7396c09874fec1872f91c355b5e7c01326f1e Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Wed, 20 Mar 2024 14:57:16 +0100 Subject: [PATCH 01/10] fpga: optimize sg descriptor rings we are now using only one memory block for both sg rings. This is required so that the SG interface can benefit from a read cache Signed-off-by: Niklas Eiling --- fpga/include/villas/fpga/ips/dma.hpp | 14 +++--- fpga/lib/ips/dma.cpp | 71 ++++++++++++---------------- 2 files changed, 38 insertions(+), 47 deletions(-) diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp index e29cc093a..c047735a3 100644 --- a/fpga/include/villas/fpga/ips/dma.hpp +++ b/fpga/include/villas/fpga/ips/dma.hpp @@ -89,8 +89,8 @@ private: Completion readCompleteSimple(); void setupScatterGather(); - void setupScatterGatherRingRx(); - void setupScatterGatherRingTx(); + void setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr); + void setupScatterGatherRingTx(uintptr_t physAddr, uintptr_t virtAddr); static constexpr char registerMemory[] = "Reg"; @@ -115,6 +115,7 @@ private: bool configDone = false; // use polling to wait for DMA completion or interrupts via efds bool polling = false; + bool cyclic = false; // Timeout after which the DMA controller issues in interrupt if no data has been received // Delay is 125 x x (clock period of SG clock). SG clock is 100 MHz by default. int delay = 0; @@ -128,13 +129,12 @@ private: // When using SG: ringBdSize is the maximum number of BDs usable in the ring // Depending on alignment, the actual number of BDs usable can be smaller - static constexpr size_t requestedRingBdSize = 2048; + static constexpr size_t requestedRingBdSize = 1; static constexpr size_t requestedRingBdSizeMemory = requestedRingBdSize * sizeof(XAxiDma_Bd); - uint32_t actualRingBdSize = XAxiDma_BdRingCntCalc( - XAXIDMA_BD_MINIMUM_ALIGNMENT, requestedRingBdSizeMemory); - std::shared_ptr sgRingTx; - std::shared_ptr sgRingRx; + uint32_t actualRingBdSize = 1; //XAxiDma_BdRingCntCalc( + //XAXIDMA_BD_MINIMUM_ALIGNMENT, requestedRingBdSizeMemory); + std::shared_ptr sgRing; }; class DmaFactory : NodeFactory { diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp index 3442fdc18..6c3d56461 100644 --- a/fpga/lib/ips/dma.cpp +++ b/fpga/lib/ips/dma.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -48,10 +49,9 @@ bool Dma::init() { hwLock.unlock(); // Map buffer descriptors if (hasScatterGather()) { - if (actualRingBdSize < 2 * readCoalesce || - actualRingBdSize < 2 * writeCoalesce) { + if (actualRingBdSize < readCoalesce || actualRingBdSize < writeCoalesce) { throw RuntimeError( - "Ring buffer size is too small for coalesce value {} < 2*{}", + "Ring buffer size is too small for coalesce value {} < {}", actualRingBdSize, std::max(readCoalesce, writeCoalesce)); } setupScatterGather(); @@ -67,11 +67,27 @@ bool Dma::init() { } void Dma::setupScatterGather() { - setupScatterGatherRingRx(); - setupScatterGatherRingTx(); + // Allocate and map space for BD ring in host RAM + auto &alloc = villas::HostRam::getAllocator(); + sgRing = alloc.allocateBlock(2 * requestedRingBdSizeMemory); + + if (not card->mapMemoryBlock(sgRing)) + throw RuntimeError("Memory not accessible by DMA"); + + auto &mm = MemoryManager::get(); + auto trans = mm.getTranslation(busMasterInterfaces[sgInterface], + sgRing->getAddrSpaceId()); + + auto physAddr = reinterpret_cast(trans.getLocalAddr(0)); + auto virtAddr = reinterpret_cast( + mm.getTranslationFromProcess(sgRing->getAddrSpaceId()).getLocalAddr(0)); + 
setupScatterGatherRingRx(physAddr, virtAddr); + + setupScatterGatherRingTx(physAddr + requestedRingBdSizeMemory, + virtAddr + requestedRingBdSizeMemory); } -void Dma::setupScatterGatherRingRx() { +void Dma::setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr) { int ret; hwLock.lock(); @@ -83,20 +99,6 @@ void Dma::setupScatterGatherRingRx() { // Set delay and coalescing XAxiDma_BdRingSetCoalesce(rxRingPtr, readCoalesce, delay); - // Allocate and map space for BD ring in host RAM - auto &alloc = villas::HostRam::getAllocator(); - sgRingRx = alloc.allocateBlock(requestedRingBdSizeMemory); - - if (not card->mapMemoryBlock(sgRingRx)) - throw RuntimeError("Memory not accessible by DMA"); - - auto &mm = MemoryManager::get(); - auto trans = mm.getTranslation(busMasterInterfaces[sgInterface], - sgRingRx->getAddrSpaceId()); - auto physAddr = reinterpret_cast(trans.getLocalAddr(0)); - auto virtAddr = reinterpret_cast( - mm.getTranslationFromProcess(sgRingRx->getAddrSpaceId()).getLocalAddr(0)); - // Setup Rx BD space ret = XAxiDma_BdRingCreate(rxRingPtr, physAddr, virtAddr, XAXIDMA_BD_MINIMUM_ALIGNMENT, actualRingBdSize); @@ -111,6 +113,11 @@ void Dma::setupScatterGatherRingRx() { if (ret != XST_SUCCESS) throw RuntimeError("Failed to clone BD template: {}", ret); + if (cyclic) { + /* Enable Cyclic DMA mode */ + XAxiDma_BdRingEnableCyclicDMA(rxRingPtr); + XAxiDma_SelectCyclicMode(&xDma, XAXIDMA_DEVICE_TO_DMA, 1); + } // Enable completion interrupt XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DEVICE_TO_DMA); // Start the RX channel @@ -121,7 +128,7 @@ void Dma::setupScatterGatherRingRx() { hwLock.unlock(); } -void Dma::setupScatterGatherRingTx() { +void Dma::setupScatterGatherRingTx(uintptr_t physAddr, uintptr_t virtAddr) { int ret; hwLock.lock(); @@ -133,20 +140,6 @@ void Dma::setupScatterGatherRingTx() { // Set TX delay and coalesce XAxiDma_BdRingSetCoalesce(txRingPtr, writeCoalesce, delay); - // Allocate and map space for BD ring in host RAM - auto &alloc = villas::HostRam::getAllocator(); - sgRingTx = alloc.allocateBlock(requestedRingBdSizeMemory); - - if (not card->mapMemoryBlock(sgRingTx)) - throw RuntimeError("Memory not accessible by DMA"); - - auto &mm = MemoryManager::get(); - auto trans = mm.getTranslation(busMasterInterfaces[sgInterface], - sgRingTx->getAddrSpaceId()); - auto physAddr = reinterpret_cast(trans.getLocalAddr(0)); - auto virtAddr = reinterpret_cast( - mm.getTranslationFromProcess(sgRingTx->getAddrSpaceId()).getLocalAddr(0)); - // Setup TxBD space ret = XAxiDma_BdRingCreate(txRingPtr, physAddr, virtAddr, XAXIDMA_BD_MINIMUM_ALIGNMENT, actualRingBdSize); @@ -163,6 +156,7 @@ void Dma::setupScatterGatherRingTx() { // Enable completion interrupt XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DMA_TO_DEVICE); + // Start the TX channel ret = XAxiDma_BdRingStart(txRingPtr); if (ret != XST_SUCCESS) @@ -214,11 +208,8 @@ Dma::~Dma() { rxRingPtr->CyclicBd = nullptr; } // unampe SG memory Blocks - if (sgRingTx) { - card->unmapMemoryBlock(*sgRingTx); - } - if (sgRingRx) { - card->unmapMemoryBlock(*sgRingRx); + if (sgRing) { + card->unmapMemoryBlock(*sgRing); } } Dma::reset(); From 483293aec84eab39b17b165f79ed12e3c118bdff Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Wed, 20 Mar 2024 16:57:46 +0100 Subject: [PATCH 02/10] fpga: turn off all interrupts when using polling this improves the latency by at least 4 us in my setup. 
Signed-off-by: Niklas Eiling --- fpga/lib/ips/dma.cpp | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp index 6c3d56461..3e915b7e3 100644 --- a/fpga/lib/ips/dma.cpp +++ b/fpga/lib/ips/dma.cpp @@ -119,7 +119,9 @@ void Dma::setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr) { XAxiDma_SelectCyclicMode(&xDma, XAXIDMA_DEVICE_TO_DMA, 1); } // Enable completion interrupt - XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DEVICE_TO_DMA); + if (!polling) { + XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DEVICE_TO_DMA); + } // Start the RX channel ret = XAxiDma_BdRingStart(rxRingPtr); if (ret != XST_SUCCESS) @@ -155,8 +157,9 @@ void Dma::setupScatterGatherRingTx(uintptr_t physAddr, uintptr_t virtAddr) { throw RuntimeError("Failed to clone TX ring BD: {}", ret); // Enable completion interrupt - XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DMA_TO_DEVICE); - + if (!polling) { + XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DMA_TO_DEVICE); + } // Start the TX channel ret = XAxiDma_BdRingStart(txRingPtr); if (ret != XST_SUCCESS) @@ -410,10 +413,6 @@ Dma::Completion Dma::writeCompleteScatterGather() { // PCIe address space, yet. The subsequent DMA Controller management can be done in a // separate thread to keep latencies in this thread extremly low. We know that we have // received one BD. - do { - // This takes 1.5 us - irqStatus = XAxiDma_BdRingGetIrq(txRing); - } while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)); } else { c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt( irqs[mm2sInterrupt].num); @@ -432,8 +431,8 @@ Dma::Completion Dma::writeCompleteScatterGather() { // Acknowledge the interrupt if (!polling) { irqStatus = XAxiDma_BdRingGetIrq(txRing); + XAxiDma_BdRingAckIrq(txRing, irqStatus); } - XAxiDma_BdRingAckIrq(txRing, irqStatus); if (c.bds == 0) { c.bytes = 0; @@ -490,10 +489,6 @@ Dma::Completion Dma::readCompleteScatterGather() { // PCIe address space, yet. The subsequent DMA Controller management can be done in a // separate thread to keep latencies in this thread extremly low. We know that we have // received one BD. - do { - // This takes 1.5 us - irqStatus = XAxiDma_BdRingGetIrq(rxRing); - } while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)); intrs = 1; } else { intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt( @@ -517,13 +512,13 @@ Dma::Completion Dma::readCompleteScatterGather() { } if (!polling) { irqStatus = XAxiDma_BdRingGetIrq(rxRing); + XAxiDma_BdRingAckIrq(rxRing, irqStatus); + if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) { + logger->error("Expected IOC interrupt but IRQ status is: {:#x}", + irqStatus); + return c; + } } - XAxiDma_BdRingAckIrq(rxRing, irqStatus); - if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) { - logger->error("Expected IOC interrupt but IRQ status is: {:#x}", irqStatus); - return c; - } - // Wait until the data has been received by the RX channel. 
if ((c.bds = XAxiDma_BdRingFromHw(rxRing, readCoalesce, &bd)) < readCoalesce) { From 248a4b3a0d76b7113033427502949fa54c74f44f Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Sat, 23 Mar 2024 18:59:24 +0100 Subject: [PATCH 03/10] fpga: improve dma latency Signed-off-by: Niklas Eiling --- fpga/include/villas/fpga/ips/dma.hpp | 19 +++--- fpga/lib/ips/dma.cpp | 45 +++++++------ include/villas/nodes/fpga.hpp | 30 ++++++++- lib/nodes/fpga.cpp | 95 +++++++++++++++++++++------- 4 files changed, 136 insertions(+), 53 deletions(-) diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp index c047735a3..10d5108b7 100644 --- a/fpga/include/villas/fpga/ips/dma.hpp +++ b/fpga/include/villas/fpga/ips/dma.hpp @@ -49,6 +49,7 @@ public: : writeCompleteSimple(); } + size_t pollReadScatterGather(bool lock); Completion readComplete() { return hasScatterGather() ? readCompleteScatterGather() : readCompleteSimple(); @@ -61,11 +62,11 @@ public: inline bool hasScatterGather() const { return xConfig.HasSg; } - const StreamVertex &getDefaultSlavePort() const { + const StreamVertex &getDefaultSlavePort() const override { return getSlavePort(s2mmPort); } - const StreamVertex &getDefaultMasterPort() const { + const StreamVertex &getDefaultMasterPort() const override { return getMasterPort(mm2sPort); } @@ -103,7 +104,7 @@ private: // Optional Scatter-Gather interface to access descriptors static constexpr char sgInterface[] = "M_AXI_SG"; - std::list getMemoryBlocks() const { + std::list getMemoryBlocks() const override { return {registerMemory}; } @@ -114,8 +115,8 @@ private: bool configDone = false; // use polling to wait for DMA completion or interrupts via efds - bool polling = false; - bool cyclic = false; + bool polling = false; // polling mode is significantly lower latency + bool cyclic = false; // not fully implemented // Timeout after which the DMA controller issues in interrupt if no data has been received // Delay is 125 x x (clock period of SG clock). SG clock is 100 MHz by default. 
int delay = 0; @@ -140,19 +141,19 @@ private: class DmaFactory : NodeFactory { public: - virtual std::string getName() const { return "dma"; } + virtual std::string getName() const override { return "dma"; } - virtual std::string getDescription() const { + virtual std::string getDescription() const override { return "Xilinx's AXI4 Direct Memory Access Controller"; } private: - virtual Vlnv getCompatibleVlnv() const { + virtual Vlnv getCompatibleVlnv() const override { return Vlnv("xilinx.com:ip:axi_dma:"); } // Create a concrete IP instance - Core *make() const { return new Dma; }; + Core *make() const override { return new Dma; }; protected: virtual void parse(Core &ip, json_t *json) override; diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp index 3e915b7e3..cf973bd21 100644 --- a/fpga/lib/ips/dma.cpp +++ b/fpga/lib/ips/dma.cpp @@ -9,6 +9,8 @@ #include #include +#include "xilinx/xaxidma_bd.h" +#include "xilinx/xaxidma_hw.h" #include #include @@ -334,7 +336,7 @@ bool Dma::writeScatterGather(const void *buf, size_t len) { } bool Dma::readScatterGather(void *buf, size_t len) { - int ret = XST_FAILURE; + uint32_t ret = XST_FAILURE; if (len < readCoalesce * readMsgSize) { throw RuntimeError( @@ -397,7 +399,7 @@ Dma::Completion Dma::writeCompleteScatterGather() { Completion c; XAxiDma_Bd *bd = nullptr, *curBd; auto txRing = XAxiDma_GetTxRing(&xDma); - int ret = XST_FAILURE; + uint32_t ret = XST_FAILURE; static size_t errcnt = 32; uint32_t irqStatus = 0; @@ -410,9 +412,7 @@ Dma::Completion Dma::writeCompleteScatterGather() { BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK)); // At this point, we know that the transmission is complete, but we haven't accessed the - // PCIe address space, yet. The subsequent DMA Controller management can be done in a - // separate thread to keep latencies in this thread extremly low. We know that we have - // received one BD. + // PCIe address space, yet. We know that we have received at least one BD. } else { c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt( irqs[mm2sInterrupt].num); @@ -451,7 +451,7 @@ Dma::Completion Dma::writeCompleteScatterGather() { if ((ret & XAXIDMA_BD_STS_ALL_ERR_MASK) || (!(ret & XAXIDMA_BD_STS_COMPLETE_MASK))) { hwLock.unlock(); - throw RuntimeError("Bd Status register shows error: {}", ret); + throw RuntimeError("Write: Bd Status register shows error: {:#x}", ret); } c.bytes += XAxiDma_BdGetLength(bd, txRing->MaxTransferLen); @@ -468,6 +468,25 @@ Dma::Completion Dma::writeCompleteScatterGather() { return c; } +size_t Dma::pollReadScatterGather(bool lock) { + if (lock) { + hwLock.lock(); + } + auto rxRing = XAxiDma_GetRxRing(&xDma); + XAxiDma_Bd *CurBdPtr = rxRing->HwHead; + volatile uint32_t BdSts; + // Poll BD status to avoid accessing PCIe address space + do { + BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); + } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK)); + // At this point, we know that the transmission is complete, but we haven't accessed the + // PCIe address space, yet. We know that we have received at least one BD. 
+ if (lock) { + hwLock.unlock(); + } + return XAxiDma_BdGetActualLength(CurBdPtr, XAXIDMA_MCHAN_MAX_TRANSFER_LEN); +} + Dma::Completion Dma::readCompleteScatterGather() { Completion c; XAxiDma_Bd *bd = nullptr, *curBd; @@ -479,16 +498,7 @@ Dma::Completion Dma::readCompleteScatterGather() { uint32_t irqStatus = 0; if (polling) { hwLock.lock(); - XAxiDma_Bd *CurBdPtr = rxRing->HwHead; - volatile uint32_t BdSts; - // Poll BD status to avoid accessing PCIe address space - do { - BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); - } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK)); - // At this point, we know that the transmission is complete, but we haven't accessed the - // PCIe address space, yet. The subsequent DMA Controller management can be done in a - // separate thread to keep latencies in this thread extremly low. We know that we have - // received one BD. + pollReadScatterGather(false); intrs = 1; } else { intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt( @@ -507,7 +517,6 @@ Dma::Completion Dma::readCompleteScatterGather() { c.interrupts = 0; return c; } else { - hwLock.unlock(); c.interrupts = intrs; } if (!polling) { @@ -550,7 +559,7 @@ Dma::Completion Dma::readCompleteScatterGather() { if ((ret & XAXIDMA_BD_STS_ALL_ERR_MASK) || (!(ret & XAXIDMA_BD_STS_COMPLETE_MASK))) { hwLock.unlock(); - throw RuntimeError("Bd Status register shows error: {}", ret); + throw RuntimeError("Read: Bd Status register shows error: {}", ret); } c.bytes += XAxiDma_BdGetActualLength(bd, rxRing->MaxTransferLen); diff --git a/include/villas/nodes/fpga.hpp b/include/villas/nodes/fpga.hpp index da9bdebd2..bf6f4a24d 100644 --- a/include/villas/nodes/fpga.hpp +++ b/include/villas/nodes/fpga.hpp @@ -9,6 +9,7 @@ #pragma once +#include #include #include #include @@ -31,17 +32,40 @@ protected: std::string cardName; std::list connectStrings; + // This setting decouples DMA management from Data processing. + // With this setting set to true, the DMA management for both read and + // write transactions is performed after the write command has been send + // the DMA controller. + // This allows us to achieve very low latencies for an application that + // waits for data from the FPGA processes it, and finished a time step + // by issuing a write to the FPGA. + bool lowLatencyMode; + // This setting performs synchronization with DMA controller in separate + // threads. It requires lowLatencyMode to be set to true. + // This may improve latency, because DMA management is completely decoupled + // from the data path, or may increase latency because of additional thread + // synchronization overhead. Only use after verifying that it improves latency. 
+ bool asyncDmaManagement; + // State std::shared_ptr card; std::shared_ptr dma; - std::shared_ptr blockRx[2]; + std::shared_ptr blockRx; std::shared_ptr blockTx; // Non-public methods + virtual int asyncRead(Sample *smps[], unsigned cnt); + virtual int slowRead(Sample *smps[], unsigned cnt); virtual int _read(Sample *smps[], unsigned cnt) override; - virtual int _write(Sample *smps[], unsigned cnt) override; + // only used if asyncDmaManagement is true + volatile std::atomic_bool readActive; + volatile std::atomic_bool writeActive; + volatile std::atomic_bool stopThreads; + std::shared_ptr dmaThread; + virtual int dmaMgmtThread(); + public: FpgaNode(const uuid_t &id = {}, const std::string &name = ""); @@ -55,6 +79,8 @@ public: virtual int start() override; + virtual int stop() override; + virtual std::vector getPollFDs() override; virtual const std::string &getDetails() override; diff --git a/lib/nodes/fpga.cpp b/lib/nodes/fpga.cpp index 8d0cd0975..849ab7bc8 100644 --- a/lib/nodes/fpga.cpp +++ b/lib/nodes/fpga.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -32,8 +33,9 @@ static std::list> cards; static std::shared_ptr vfioContainer; FpgaNode::FpgaNode(const uuid_t &id, const std::string &name) - : Node(id, name), cardName(""), card(nullptr), dma(), blockRx(), blockTx() { -} + : Node(id, name), cardName(""), connectStrings(), card(nullptr), dma(), + blockRx(), blockTx(), readActive(false), writeActive(false), + stopThreads(false), dmaThread() {} FpgaNode::~FpgaNode() {} @@ -75,14 +77,12 @@ int FpgaNode::prepare() { auto &alloc = HostRam::getAllocator(); - blockRx[0] = alloc.allocateBlock(0x200 * sizeof(float)); - blockRx[1] = alloc.allocateBlock(0x200 * sizeof(float)); + blockRx = alloc.allocateBlock(0x200 * sizeof(float)); blockTx = alloc.allocateBlock(0x200 * sizeof(float)); - villas::MemoryAccessor memRx[] = {*(blockRx[0]), *(blockRx[1])}; + villas::MemoryAccessor memRx = *blockRx; villas::MemoryAccessor memTx = *blockTx; - dma->makeAccesibleFromVA(blockRx[0]); - dma->makeAccesibleFromVA(blockRx[1]); + dma->makeAccesibleFromVA(blockRx); dma->makeAccesibleFromVA(blockTx); MemoryManager::get().printGraph(); @@ -90,6 +90,16 @@ int FpgaNode::prepare() { return Node::prepare(); } +int FpgaNode::stop() { + if (asyncDmaManagement) { + stopThreads = true; + if (dmaThread) { + dmaThread->join(); + } + } + return Node::stop(); +} + int FpgaNode::parse(json_t *json) { int ret = Node::parse(json); if (ret) { @@ -105,8 +115,9 @@ int FpgaNode::parse(json_t *json) { vfioContainer = std::make_shared(); } - ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o}", "card", &jsonCard, - "connect", &jsonConnectStrings); + ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o, s?: b}", "card", + &jsonCard, "connect", &jsonConnectStrings, + "asyncDmaManagement", &asyncDmaManagement); if (ret) { throw ConfigError(json, err, "node-config-fpga", "Failed to parse configuration of node {}", @@ -167,20 +178,49 @@ const std::string &FpgaNode::getDetails() { int FpgaNode::check() { return 0; } int FpgaNode::start() { - // enque first read - // dma->read(*(blockRx[0]), blockRx[0]->getSize()); + if (asyncDmaManagement) { + dmaThread = std::make_shared(&FpgaNode::dmaMgmtThread, this); + } + dma->read(*blockRx, blockRx->getSize()); return Node::start(); } +int FpgaNode::dmaMgmtThread() { + while (readActive) { + usleep(1); + } + while (!stopThreads) { + // readActive must be true, writeActive must be false + dma->read(*blockRx, blockRx->getSize()); + readActive = true; + while (readActive && 
!stopThreads) { + } + while (!writeActive && !stopThreads) { + } + // readActive must be false, writeActive must be true + dma->writeComplete(); + writeActive = false; + } + + return 0; +} + int FpgaNode::_read(Sample *smps[], unsigned cnt) { - static size_t cur = 0, next = 0; unsigned read; Sample *smp = smps[0]; assert(cnt == 1); - dma->read(*(blockRx[next]), blockRx[next]->getSize()); // TODO: calc size + if (asyncDmaManagement) { + while (!readActive.load(std::memory_order_relaxed) && !stopThreads) + ; + } else { + // dma->read(*blockRx, blockRx->getSize()); + } auto c = dma->readComplete(); + if (asyncDmaManagement) { + readActive.store(false, std::memory_order_relaxed); + } read = c.bytes / sizeof(float); @@ -188,7 +228,7 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) { logger->warn("Missed {} interrupts", c.interrupts - 1); } - auto mem = MemoryAccessor(*(blockRx[cur])); + auto mem = MemoryAccessor(*blockRx); smp->length = 0; for (unsigned i = 0; i < MIN(read, smp->capacity); i++) { @@ -198,18 +238,20 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) { smp->flags = (int)SampleFlags::HAS_DATA; smp->signals = in.signals; - //cur = next; - //next = (next + 1) % (sizeof(blockRx) / sizeof(blockRx[0])); return 1; } int FpgaNode::_write(Sample *smps[], unsigned cnt) { - unsigned int written; + // unsigned int written; Sample *smp = smps[0]; assert(cnt == 1 && smps != nullptr && smps[0] != nullptr); + if (asyncDmaManagement) { + while (writeActive.load(std::memory_order_relaxed) && !stopThreads) + ; + } auto mem = MemoryAccessor(*blockTx); float scaled; @@ -224,15 +266,20 @@ int FpgaNode::_write(Sample *smps[], unsigned cnt) { } bool state = dma->write(*blockTx, smp->length * sizeof(float)); - if (!state) + if (!state) { return -1; + } + if (asyncDmaManagement) { + writeActive.store(true, std::memory_order_relaxed); + } else { + auto written = dma->writeComplete().bytes / + sizeof(float); // The number of samples written - written = dma->writeComplete().bytes / - sizeof(float); // The number of samples written - - if (written != smp->length) { - logger->warn("Wrote {} samples, but {} were expected", written, - smp->length); + if (written != smp->length) { + logger->warn("Wrote {} samples, but {} were expected", written, + smp->length); + } + dma->read(*blockRx, blockRx->getSize()); } return 1; From 2cc8cad115605ed3a5db85b1e2398c8c1606a222 Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Sun, 24 Mar 2024 13:17:34 +0100 Subject: [PATCH 04/10] fpga: expose methods for finer control over DMA data path Signed-off-by: Niklas Eiling --- fpga/include/villas/fpga/ips/dma.hpp | 11 +- fpga/lib/ips/dma.cpp | 182 +++++++++++++++++++++++---- 2 files changed, 169 insertions(+), 24 deletions(-) diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp index 10d5108b7..e35f7b309 100644 --- a/fpga/include/villas/fpga/ips/dma.hpp +++ b/fpga/include/villas/fpga/ips/dma.hpp @@ -49,7 +49,14 @@ public: : writeCompleteSimple(); } - size_t pollReadScatterGather(bool lock); + bool readScatterGatherPrepare(const MemoryBlock &mem, size_t len); + bool readScatterGatherFast(); + size_t readScatterGatherPoll(bool lock = true); + + bool writeScatterGatherPrepare(const MemoryBlock &mem, size_t len); + bool writeScatterGatherFast(); + size_t writeScatterGatherPoll(bool lock = true); + Completion readComplete() { return hasScatterGather() ? 
readCompleteScatterGather() : readCompleteSimple(); @@ -81,7 +88,9 @@ public: private: bool writeScatterGather(const void *buf, size_t len); bool readScatterGather(void *buf, size_t len); + XAxiDma_Bd *writeScatterGatherSetupBd(const void *buf, size_t len); Completion writeCompleteScatterGather(); + XAxiDma_Bd *readScatterGatherSetupBd(void *buf, size_t len); Completion readCompleteScatterGather(); bool writeSimple(const void *buf, size_t len); diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp index cf973bd21..24e6d627a 100644 --- a/fpga/lib/ips/dma.cpp +++ b/fpga/lib/ips/dma.cpp @@ -284,12 +284,58 @@ bool Dma::read(const MemoryBlock &mem, size_t len) { : readSimple(buf, len); } -//Write a single message -bool Dma::writeScatterGather(const void *buf, size_t len) { - // buf is address from view of DMA controller - - int ret = XST_FAILURE; +// Reuse existing single BD bypassing BdRingFree, Alloc, ToHw +bool Dma::writeScatterGatherFast() { hwLock.lock(); + auto *txRing = XAxiDma_GetTxRing(&xDma); + if (txRing == nullptr) { + hwLock.unlock(); + throw RuntimeError("RxRing was null."); + } + XAxiDma_Bd *CurBdPtr = txRing->HwHead; + + // Clear the bit we are polling on in complete + uint32_t BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); + BdSts &= ~XAXIDMA_BD_STS_COMPLETE_MASK; + XAxiDma_BdWrite(CurBdPtr, XAXIDMA_BD_STS_OFFSET, BdSts); + + uintptr_t tdesc = ((uintptr_t)txRing->HwTail + + (txRing->FirstBdPhysAddr - txRing->FirstBdAddr)) & + XAXIDMA_DESC_LSB_MASK; + XAxiDma_WriteReg(txRing->ChanBase, XAXIDMA_TDESC_OFFSET, tdesc); + + hwLock.unlock(); + return true; +} + +bool Dma::writeScatterGatherPrepare(const MemoryBlock &mem, size_t len) { + + auto &mm = MemoryManager::get(); + + // User has to make sure that memory is accessible, otherwise this will throw + auto trans = mm.getTranslation(busMasterInterfaces[mm2sInterface], + mem.getAddrSpaceId()); + void *buf = reinterpret_cast(trans.getLocalAddr(0)); + if (buf == nullptr) { + throw RuntimeError("Buffer was null"); + } + hwLock.lock(); + + auto bd = writeScatterGatherSetupBd(buf, len); + + hwLock.unlock(); + + return bd != nullptr; +} + +XAxiDma_Bd *Dma::writeScatterGatherSetupBd(const void *buf, size_t len) { + uint32_t ret = XST_FAILURE; + if (len == 0) + return nullptr; + + if (len > FPGA_DMA_BOUNDARY) + return nullptr; + auto *txRing = XAxiDma_GetTxRing(&xDma); if (txRing == nullptr) { hwLock.unlock(); @@ -321,7 +367,22 @@ bool Dma::writeScatterGather(const void *buf, size_t len) { // TODO: Check if we really need this XAxiDma_BdSetId(bd, (uintptr_t)buf); + return bd; +} +//Write a single message +bool Dma::writeScatterGather(const void *buf, size_t len) { + // buf is address from view of DMA controller + + int ret = XST_FAILURE; + hwLock.lock(); + auto *txRing = XAxiDma_GetTxRing(&xDma); + if (txRing == nullptr) { + hwLock.unlock(); + throw RuntimeError("TxRing was null."); + } + + XAxiDma_Bd *bd = writeScatterGatherSetupBd(buf, len); // Give control of BD to HW. We should not access it until transfer is finished. // Failure could also indicate that EOF is not set on last Bd ret = XAxiDma_BdRingToHw(txRing, 1, bd); @@ -335,20 +396,64 @@ bool Dma::writeScatterGather(const void *buf, size_t len) { return true; } -bool Dma::readScatterGather(void *buf, size_t len) { - uint32_t ret = XST_FAILURE; - - if (len < readCoalesce * readMsgSize) { - throw RuntimeError( - "Read size is smaller than readCoalesce*msgSize. 
Cannot setup BDs."); - } - +// Reuse existing single BD bypassing BdRingFree, Alloc, ToHw +bool Dma::readScatterGatherFast() { hwLock.lock(); auto *rxRing = XAxiDma_GetRxRing(&xDma); if (rxRing == nullptr) { hwLock.unlock(); throw RuntimeError("RxRing was null."); } + XAxiDma_Bd *CurBdPtr = rxRing->HwHead; + // Poll BD status to avoid accessing PCIe address space + uint32_t BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); + + // Clear the bit we are polling on in complete + BdSts &= ~XAXIDMA_BD_STS_COMPLETE_MASK; + XAxiDma_BdWrite(CurBdPtr, XAXIDMA_BD_STS_OFFSET, BdSts); + + uintptr_t tdesc = ((uintptr_t)rxRing->HwTail + + (rxRing->FirstBdPhysAddr - rxRing->FirstBdAddr)) & + XAXIDMA_DESC_LSB_MASK; + XAxiDma_WriteReg(rxRing->ChanBase, XAXIDMA_TDESC_OFFSET, tdesc); + + hwLock.unlock(); + return true; +} + +bool Dma::readScatterGatherPrepare(const MemoryBlock &mem, size_t len) { + + auto &mm = MemoryManager::get(); + + // User has to make sure that memory is accessible, otherwise this will throw + auto trans = mm.getTranslation(busMasterInterfaces[s2mmInterface], + mem.getAddrSpaceId()); + void *buf = reinterpret_cast(trans.getLocalAddr(0)); + if (buf == nullptr) { + throw RuntimeError("Buffer was null"); + } + hwLock.lock(); + + auto bd = readScatterGatherSetupBd(buf, len); + + hwLock.unlock(); + + return bd != nullptr; +} + +XAxiDma_Bd *Dma::readScatterGatherSetupBd(void *buf, size_t len) { + uint32_t ret = XST_FAILURE; + if (len == 0) + return nullptr; + + if (len > FPGA_DMA_BOUNDARY) + return nullptr; + + auto *rxRing = XAxiDma_GetRxRing(&xDma); + if (rxRing == nullptr) { + hwLock.unlock(); + throw RuntimeError("RxRing was null."); + } XAxiDma_Bd *bd; ret = XAxiDma_BdRingAlloc(rxRing, readCoalesce, &bd); @@ -384,6 +489,25 @@ bool Dma::readScatterGather(void *buf, size_t len) { curBuf += readMsgSize; curBd = (XAxiDma_Bd *)XAxiDma_BdRingNext(rxRing, curBd); } + return bd; +} + +bool Dma::readScatterGather(void *buf, size_t len) { + uint32_t ret = XST_FAILURE; + + if (len < readCoalesce * readMsgSize) { + throw RuntimeError( + "Read size is smaller than readCoalesce*msgSize. Cannot setup BDs."); + } + + hwLock.lock(); + auto *rxRing = XAxiDma_GetRxRing(&xDma); + if (rxRing == nullptr) { + hwLock.unlock(); + throw RuntimeError("RxRing was null."); + } + + XAxiDma_Bd *bd = readScatterGatherSetupBd(buf, len); ret = XAxiDma_BdRingToHw(rxRing, readCoalesce, bd); if (ret != XST_SUCCESS) { @@ -395,6 +519,25 @@ bool Dma::readScatterGather(void *buf, size_t len) { return true; } +size_t Dma::writeScatterGatherPoll(bool lock) { + if (lock) { + hwLock.lock(); + } + auto txRing = XAxiDma_GetTxRing(&xDma); + XAxiDma_Bd *CurBdPtr = txRing->HwHead; + volatile uint32_t BdSts; + // Poll BD status to avoid accessing PCIe address space + do { + BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); + } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK)); + // At this point, we know that the transmission is complete, but we haven't accessed the + // PCIe address space, yet. We know that we have received at least one BD. 
+ if (lock) { + hwLock.unlock(); + } + return XAxiDma_BdGetActualLength(CurBdPtr, XAXIDMA_MCHAN_MAX_TRANSFER_LEN); +} + Dma::Completion Dma::writeCompleteScatterGather() { Completion c; XAxiDma_Bd *bd = nullptr, *curBd; @@ -405,14 +548,7 @@ Dma::Completion Dma::writeCompleteScatterGather() { uint32_t irqStatus = 0; if (polling) { hwLock.lock(); - XAxiDma_Bd *CurBdPtr = txRing->HwHead; - volatile uint32_t BdSts; - // Poll BD status to avoid accessing PCIe address space - do { - BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); - } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK)); - // At this point, we know that the transmission is complete, but we haven't accessed the - // PCIe address space, yet. We know that we have received at least one BD. + writeScatterGatherPoll(false); } else { c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt( irqs[mm2sInterrupt].num); @@ -468,7 +604,7 @@ Dma::Completion Dma::writeCompleteScatterGather() { return c; } -size_t Dma::pollReadScatterGather(bool lock) { +size_t Dma::readScatterGatherPoll(bool lock) { if (lock) { hwLock.lock(); } @@ -498,7 +634,7 @@ Dma::Completion Dma::readCompleteScatterGather() { uint32_t irqStatus = 0; if (polling) { hwLock.lock(); - pollReadScatterGather(false); + readScatterGatherPoll(false); intrs = 1; } else { intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt( From f498518236ccfaf277b773293953e908ea519a01 Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Mon, 25 Mar 2024 09:12:21 +0100 Subject: [PATCH 05/10] fpga: update ips json in fpga.conf Signed-off-by: Niklas Eiling --- etc/examples/nodes/fpga.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/examples/nodes/fpga.conf b/etc/examples/nodes/fpga.conf index e768cc183..697e2a8df 100644 --- a/etc/examples/nodes/fpga.conf +++ b/etc/examples/nodes/fpga.conf @@ -11,7 +11,7 @@ fpgas = { id = "10ee:7021", slot = "0000:88:00.0", do_reset = true, - ips = "../../../fpga/etc/vc707-xbar-pcie/vc707-xbar-pcie.json", + ips = "../../fpga/etc/vc707-xbar-pcie/vc707-xbar-pcie-dino-v2.json", polling = false, } } From 9cf926d84e7059bd4944cb0d22ad07f6f8311d41 Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Mon, 25 Mar 2024 09:14:08 +0100 Subject: [PATCH 06/10] fpga: add lowLatencyMode setting This setting improves latency by remove various checks. Use with caution! Requires read cache in FPGA design! The common use case in VILLASfpga is that we have exactly one write for every read and the number of exchanged signals do not change. If this is the case, we can reuse the buffer descriptors during reads and write, thus avoidng freeing, reallocating and setting them up. We set up the descriptors in start, and in write or read, we only reset the complete bit in the buffer descriptor and write to the tdesc register to start the DMA transfer. Improves read/write latency by approx. 40%. Signed-off-by: Niklas Eiling --- include/villas/nodes/fpga.hpp | 35 ++++---- lib/nodes/fpga.cpp | 150 +++++++++++++++++++++------------- 2 files changed, 107 insertions(+), 78 deletions(-) diff --git a/include/villas/nodes/fpga.hpp b/include/villas/nodes/fpga.hpp index bf6f4a24d..c292f4e56 100644 --- a/include/villas/nodes/fpga.hpp +++ b/include/villas/nodes/fpga.hpp @@ -32,20 +32,18 @@ protected: std::string cardName; std::list connectStrings; - // This setting decouples DMA management from Data processing. 
- // With this setting set to true, the DMA management for both read and - // write transactions is performed after the write command has been send - // the DMA controller. - // This allows us to achieve very low latencies for an application that - // waits for data from the FPGA processes it, and finished a time step - // by issuing a write to the FPGA. + // This setting improves latency by remove various checks. + // Use with caution! Requires read cache in FPGA design! + // The common use case in VILLASfpga is that we have exactly + // one write for every read and the number of exchanged signals + // do not change. If this is the case, we can reuse the buffer + // descriptors during reads and write, thus avoidng freeing, + // reallocating and setting them up. + // We set up the descriptors in start, and in write or read, + // we only reset the complete bit in the buffer descriptor and + // write to the tdesc register to start the DMA transfer. + // Improves read/write latency by approx. 40%. bool lowLatencyMode; - // This setting performs synchronization with DMA controller in separate - // threads. It requires lowLatencyMode to be set to true. - // This may improve latency, because DMA management is completely decoupled - // from the data path, or may increase latency because of additional thread - // synchronization overhead. Only use after verifying that it improves latency. - bool asyncDmaManagement; // State std::shared_ptr card; @@ -54,18 +52,13 @@ protected: std::shared_ptr blockTx; // Non-public methods - virtual int asyncRead(Sample *smps[], unsigned cnt); + virtual int fastRead(Sample *smps[], unsigned cnt); virtual int slowRead(Sample *smps[], unsigned cnt); virtual int _read(Sample *smps[], unsigned cnt) override; + virtual int fastWrite(Sample *smps[], unsigned cnt); + virtual int slowWrite(Sample *smps[], unsigned cnt); virtual int _write(Sample *smps[], unsigned cnt) override; - // only used if asyncDmaManagement is true - volatile std::atomic_bool readActive; - volatile std::atomic_bool writeActive; - volatile std::atomic_bool stopThreads; - std::shared_ptr dmaThread; - virtual int dmaMgmtThread(); - public: FpgaNode(const uuid_t &id = {}, const std::string &name = ""); diff --git a/lib/nodes/fpga.cpp b/lib/nodes/fpga.cpp index 849ab7bc8..c2be73f08 100644 --- a/lib/nodes/fpga.cpp +++ b/lib/nodes/fpga.cpp @@ -33,9 +33,8 @@ static std::list> cards; static std::shared_ptr vfioContainer; FpgaNode::FpgaNode(const uuid_t &id, const std::string &name) - : Node(id, name), cardName(""), connectStrings(), card(nullptr), dma(), - blockRx(), blockTx(), readActive(false), writeActive(false), - stopThreads(false), dmaThread() {} + : Node(id, name), cardName(""), connectStrings(), lowLatencyMode(false), + card(nullptr), dma(), blockRx(), blockTx() {} FpgaNode::~FpgaNode() {} @@ -90,15 +89,7 @@ int FpgaNode::prepare() { return Node::prepare(); } -int FpgaNode::stop() { - if (asyncDmaManagement) { - stopThreads = true; - if (dmaThread) { - dmaThread->join(); - } - } - return Node::stop(); -} +int FpgaNode::stop() { return Node::stop(); } int FpgaNode::parse(json_t *json) { int ret = Node::parse(json); @@ -115,9 +106,9 @@ int FpgaNode::parse(json_t *json) { vfioContainer = std::make_shared(); } - ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o, s?: b}", "card", + ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o, s?: b, s?: b}", "card", &jsonCard, "connect", &jsonConnectStrings, - "asyncDmaManagement", &asyncDmaManagement); + "lowLatencyMode", &lowLatencyMode); if (ret) { throw 
ConfigError(json, err, "node-config-fpga", "Failed to parse configuration of node {}", @@ -169,7 +160,8 @@ const std::string &FpgaNode::getDetails() { std::copy(connectStrings.begin(), connectStrings.end(), std::ostream_iterator(imploded, delim)); - details = fmt::format("fpga={}, connect={}", name, imploded.str()); + details = fmt::format("fpga={}, connect={}, lowLatencyMode={}", name, + imploded.str(), lowLatencyMode); } return details; @@ -178,49 +170,94 @@ const std::string &FpgaNode::getDetails() { int FpgaNode::check() { return 0; } int FpgaNode::start() { - if (asyncDmaManagement) { - dmaThread = std::make_shared(&FpgaNode::dmaMgmtThread, this); + if (getInputSignalsMaxCount() * sizeof(float) > blockRx->getSize()) { + logger->error("Input signals exceed block size."); + throw villas ::RuntimeError("Input signals exceed block size."); } - dma->read(*blockRx, blockRx->getSize()); + if (lowLatencyMode) { + dma->readScatterGatherPrepare(*blockRx, blockRx->getSize()); + if (getInputSignalsMaxCount() != 0) { + dma->writeScatterGatherPrepare(*blockTx, + getInputSignalsMaxCount() * sizeof(float)); + } else { + logger->warn("No input signals defined. Not preparing write buffer - " + "writes will not work."); + } + } + return Node::start(); } -int FpgaNode::dmaMgmtThread() { - while (readActive) { - usleep(1); - } - while (!stopThreads) { - // readActive must be true, writeActive must be false - dma->read(*blockRx, blockRx->getSize()); - readActive = true; - while (readActive && !stopThreads) { - } - while (!writeActive && !stopThreads) { - } - // readActive must be false, writeActive must be true - dma->writeComplete(); - writeActive = false; +int FpgaNode::fastWrite(Sample *smps[], unsigned cnt) { + Sample *smp = smps[0]; + + assert(cnt == 1 && smps != nullptr && smps[0] != nullptr); + + auto mem = MemoryAccessor(*blockTx); + float scaled; + + for (unsigned i = 0; i < smp->length; i++) { + if (smp->signals->getByIndex(i)->type == SignalType::FLOAT) { + scaled = smp->data[i].f; + if (scaled > 10.) { + scaled = 10.; + } else if (scaled < -10.) { + scaled = -10.; + } + mem[i] = (scaled + 10.) * ((float)0xFFFF / 20.); + } else { + mem[i] = smp->data[i].i; + } } - return 0; + dma->writeScatterGatherFast(); + auto written = dma->writeScatterGatherPoll() / + sizeof(float); // The number of samples written + + if (written != smp->length) { + logger->warn("Wrote {} samples, but {} were expected", written, + smp->length); + } + + return 1; +} + +int FpgaNode::fastRead(Sample *smps[], unsigned cnt) { + Sample *smp = smps[0]; + auto mem = MemoryAccessor(*blockRx); + + smp->flags = (int)SampleFlags::HAS_DATA; + smp->signals = in.signals; + + dma->readScatterGatherFast(); + auto read = dma->readScatterGatherPoll(true); + // We assume a lot without checking at this point. All for the latency! 
+ + smp->length = 0; + for (unsigned i = 0; i < MIN(read / sizeof(float), smp->capacity); i++) { + smp->data[i].f = static_cast(mem[i]); + smp->length++; + } + + return 1; } int FpgaNode::_read(Sample *smps[], unsigned cnt) { + if (lowLatencyMode) { + return fastRead(smps, cnt); + } else { + return slowRead(smps, cnt); + } +} + +int FpgaNode::slowRead(Sample *smps[], unsigned cnt) { unsigned read; Sample *smp = smps[0]; assert(cnt == 1); - if (asyncDmaManagement) { - while (!readActive.load(std::memory_order_relaxed) && !stopThreads) - ; - } else { - // dma->read(*blockRx, blockRx->getSize()); - } + dma->read(*blockRx, blockRx->getSize()); auto c = dma->readComplete(); - if (asyncDmaManagement) { - readActive.store(false, std::memory_order_relaxed); - } read = c.bytes / sizeof(float); @@ -243,15 +280,19 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) { } int FpgaNode::_write(Sample *smps[], unsigned cnt) { + if (lowLatencyMode) { + return fastWrite(smps, cnt); + } else { + return slowWrite(smps, cnt); + } +} + +int FpgaNode::slowWrite(Sample *smps[], unsigned cnt) { // unsigned int written; Sample *smp = smps[0]; assert(cnt == 1 && smps != nullptr && smps[0] != nullptr); - if (asyncDmaManagement) { - while (writeActive.load(std::memory_order_relaxed) && !stopThreads) - ; - } auto mem = MemoryAccessor(*blockTx); float scaled; @@ -269,17 +310,12 @@ int FpgaNode::_write(Sample *smps[], unsigned cnt) { if (!state) { return -1; } - if (asyncDmaManagement) { - writeActive.store(true, std::memory_order_relaxed); - } else { - auto written = dma->writeComplete().bytes / - sizeof(float); // The number of samples written + auto written = dma->writeComplete().bytes / + sizeof(float); // The number of samples written - if (written != smp->length) { - logger->warn("Wrote {} samples, but {} were expected", written, - smp->length); - } - dma->read(*blockRx, blockRx->getSize()); + if (written != smp->length) { + logger->warn("Wrote {} samples, but {} were expected", written, + smp->length); } return 1; From a2ff0aca43a752f5e365f5b66b7eb8de4836e6d7 Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Mon, 25 Mar 2024 09:36:21 +0100 Subject: [PATCH 07/10] fix formatting in fpga Signed-off-by: Niklas Eiling --- lib/nodes/fpga.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/nodes/fpga.cpp b/lib/nodes/fpga.cpp index c2be73f08..4781f32f7 100644 --- a/lib/nodes/fpga.cpp +++ b/lib/nodes/fpga.cpp @@ -197,7 +197,7 @@ int FpgaNode::fastWrite(Sample *smps[], unsigned cnt) { float scaled; for (unsigned i = 0; i < smp->length; i++) { - if (smp->signals->getByIndex(i)->type == SignalType::FLOAT) { + if (smp->signals->getByIndex(i)->type == SignalType::FLOAT) { scaled = smp->data[i].f; if (scaled > 10.) { scaled = 10.; @@ -205,9 +205,9 @@ int FpgaNode::fastWrite(Sample *smps[], unsigned cnt) { scaled = -10.; } mem[i] = (scaled + 10.) 
* ((float)0xFFFF / 20.); - } else { + } else { mem[i] = smp->data[i].i; - } + } } dma->writeScatterGatherFast(); From f1776f8be4a45e371d07680ddc83dd849cebb00c Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Thu, 18 Apr 2024 10:32:00 +0200 Subject: [PATCH 08/10] fpga: improve comments for fastRead and fastWrite Signed-off-by: Niklas Eiling --- fpga/include/villas/fpga/ips/dma.hpp | 6 ++++-- fpga/lib/ips/dma.cpp | 2 +- lib/nodes/fpga.cpp | 6 ++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp index e35f7b309..f4175b0df 100644 --- a/fpga/include/villas/fpga/ips/dma.hpp +++ b/fpga/include/villas/fpga/ips/dma.hpp @@ -139,11 +139,13 @@ private: // When using SG: ringBdSize is the maximum number of BDs usable in the ring // Depending on alignment, the actual number of BDs usable can be smaller + // We use a single BD for transfers, because this way we can achieve the best + // latency. The AXI read cache in the FPGA also only supports a single BD. + // TODO: We could make this configurable in the future. static constexpr size_t requestedRingBdSize = 1; static constexpr size_t requestedRingBdSizeMemory = requestedRingBdSize * sizeof(XAxiDma_Bd); - uint32_t actualRingBdSize = 1; //XAxiDma_BdRingCntCalc( - //XAXIDMA_BD_MINIMUM_ALIGNMENT, requestedRingBdSizeMemory); + uint32_t actualRingBdSize = 1; std::shared_ptr sgRing; }; diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp index 24e6d627a..1ba98d108 100644 --- a/fpga/lib/ips/dma.cpp +++ b/fpga/lib/ips/dma.cpp @@ -370,7 +370,7 @@ XAxiDma_Bd *Dma::writeScatterGatherSetupBd(const void *buf, size_t len) { return bd; } -//Write a single message +// Write a single message bool Dma::writeScatterGather(const void *buf, size_t len) { // buf is address from view of DMA controller diff --git a/lib/nodes/fpga.cpp b/lib/nodes/fpga.cpp index 4781f32f7..c365436e9 100644 --- a/lib/nodes/fpga.cpp +++ b/lib/nodes/fpga.cpp @@ -188,6 +188,9 @@ int FpgaNode::start() { return Node::start(); } +// We cannot modify the BD here, so writes are fixed length. +// If fastWrite receives less signals than expected, the previous data +// will be reused for the remaining signals int FpgaNode::fastWrite(Sample *smps[], unsigned cnt) { Sample *smp = smps[0]; @@ -222,6 +225,9 @@ int FpgaNode::fastWrite(Sample *smps[], unsigned cnt) { return 1; } +// Because we cannot modify the BD here, reads are fixed length. +// However, if we receive less data than expected, we will return only +// what we have received. fastRead is thus capable of partial reads. 
int FpgaNode::fastRead(Sample *smps[], unsigned cnt) { Sample *smp = smps[0]; auto mem = MemoryAccessor(*blockRx); From c151be5ccaa9730b653763428b3f31398006e02c Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Thu, 18 Apr 2024 14:32:52 +0200 Subject: [PATCH 09/10] fpga: fix includes and various comments Signed-off-by: Niklas Eiling --- fpga/include/villas/fpga/ips/dma.hpp | 2 ++ fpga/lib/ips/dma.cpp | 19 +++++++++---------- include/villas/nodes/fpga.hpp | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp index f4175b0df..ce0c899da 100644 --- a/fpga/include/villas/fpga/ips/dma.hpp +++ b/fpga/include/villas/fpga/ips/dma.hpp @@ -10,10 +10,12 @@ #pragma once #include + #include #include #include #include + #include namespace villas { diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp index 1ba98d108..872c0ae22 100644 --- a/fpga/lib/ips/dma.cpp +++ b/fpga/lib/ips/dma.cpp @@ -1,24 +1,23 @@ /* DMA driver * - * Author: Daniel Krebs * Author: Niklas Eiling - * SPDX-FileCopyrightText: 2018 Institute for Automation of Complex Power Systems, RWTH Aachen University + * Author: Daniel Krebs + * SPDX-FileCopyrightText: 2018-2024 Institute for Automation of Complex Power Systems, RWTH Aachen University * SPDX-License-Identifier: Apache-2.0 */ #include #include - -#include "xilinx/xaxidma_bd.h" -#include "xilinx/xaxidma_hw.h" #include -#include - -#include #include #include #include +#include + +#include +#include +#include // Max. size of a DMA transfer in simple mode #define FPGA_DMA_BOUNDARY 0x1000 @@ -116,7 +115,7 @@ void Dma::setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr) { throw RuntimeError("Failed to clone BD template: {}", ret); if (cyclic) { - /* Enable Cyclic DMA mode */ + // Enable Cyclic DMA mode XAxiDma_BdRingEnableCyclicDMA(rxRingPtr); XAxiDma_SelectCyclicMode(&xDma, XAXIDMA_DEVICE_TO_DMA, 1); } @@ -212,7 +211,7 @@ Dma::~Dma() { free(rxRingPtr->CyclicBd); rxRingPtr->CyclicBd = nullptr; } - // unampe SG memory Blocks + // Unmap SG memory Blocks if (sgRing) { card->unmapMemoryBlock(*sgRing); } diff --git a/include/villas/nodes/fpga.hpp b/include/villas/nodes/fpga.hpp index c292f4e56..a124efca3 100644 --- a/include/villas/nodes/fpga.hpp +++ b/include/villas/nodes/fpga.hpp @@ -32,7 +32,7 @@ protected: std::string cardName; std::list connectStrings; - // This setting improves latency by remove various checks. + // This setting improves latency by removing various checks. // Use with caution! Requires read cache in FPGA design! 
// The common use case in VILLASfpga is that we have exactly // one write for every read and the number of exchanged signals From ed05671a5169e02c41bb016d327ea92f052bbcf5 Mon Sep 17 00:00:00 2001 From: Niklas Eiling Date: Thu, 18 Apr 2024 14:46:03 +0200 Subject: [PATCH 10/10] fpga: improve comments in register.cpp Signed-off-by: Niklas Eiling --- fpga/lib/ips/register.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fpga/lib/ips/register.cpp b/fpga/lib/ips/register.cpp index 83bb134c0..9310a1183 100644 --- a/fpga/lib/ips/register.cpp +++ b/fpga/lib/ips/register.cpp @@ -43,9 +43,12 @@ bool Register::check() { } // This is Dino specific for now - we should possibly move this to Dino in the future - setRegister(0, static_cast(1000)); // set Dino to a rate of 20 kHz - setRegister(1, -0.001615254F); - setRegister(2, 10.8061F); + constexpr double dinoClk = 25e9; // Dino is clocked with 25 Mhz + constexpr double sampleRate = 20e6; // We want to achieve a timestep of 50us + constexpr uint32_t dinoTimerVal = static_cast(dinoClk / sampleRate); + setRegister(0, dinoTimerVal); // Timer value for generating ADC trigger signal + setRegister(1, -0.001615254F); // Scale factor for ADC value + setRegister(2, 10.8061F); // Offset for ADC value uint32_t rate = getRegister(0); float scale = getRegisterFloat(1); float offset = getRegisterFloat(2);
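
For reference, a minimal standalone sketch of the trigger-divider arithmetic that PATCH 10/10 writes into register 0 — assuming the 25 MHz Dino clock and 50 µs ADC trigger period (20 kHz) described in the comments of register.cpp. Only the ratio of the two figures ends up in the register, so the divider is the same 1250 that the patch computes from 25e9 / 20e6:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed figures, taken from the comments in register.cpp: the Dino
  // sampling logic runs at 25 MHz and the ADC should be triggered every
  // 50 us, i.e. at 20 kHz.
  constexpr double dinoClk = 25e6;    // 25 MHz Dino clock
  constexpr double sampleRate = 20e3; // 20 kHz ADC trigger rate
  constexpr auto dinoTimerVal = static_cast<uint32_t>(dinoClk / sampleRate);

  static_assert(dinoTimerVal == 1250, "divider written to register 0");
  std::printf("timer divider = %u -> timestep = %.1f us\n",
              static_cast<unsigned>(dinoTimerVal), 1e6 / sampleRate);
  return 0;
}
```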
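
Stepping back from the individual patches, a minimal sketch of the call order the lowLatencyMode fast path relies on (PATCH 04/10 introduces the prepare/fast/poll methods, PATCH 06/10 uses them from the node). The method names and signatures are taken from fpga/include/villas/fpga/ips/dma.hpp as changed by this series; the include paths, namespaces, and the two helper functions are assumptions added only to make the sequence explicit, not part of the patches:

```cpp
#include <cstddef>

#include <villas/fpga/ips/dma.hpp> // assumed include path from the repo layout
#include <villas/memory.hpp>       // assumed location of villas::MemoryBlock

using villas::MemoryBlock;
using villas::fpga::ip::Dma; // assumed namespace of the DMA IP driver

// Once, at node start: set up one reusable BD per direction, as
// FpgaNode::start() does when lowLatencyMode is enabled.
void prepareLowLatency(Dma &dma, const MemoryBlock &rx, const MemoryBlock &tx,
                       size_t txLen) {
  dma.readScatterGatherPrepare(rx, rx.getSize());
  dma.writeScatterGatherPrepare(tx, txLen); // txLen = signal count * sizeof(float)
}

// Every timestep: re-arm the prepared BDs and busy-poll their complete bit,
// avoiding BdRingFree/Alloc/ToHw and PCIe register reads while waiting.
size_t exchangeOnce(Dma &dma) {
  dma.readScatterGatherFast();                  // restart the RX transfer
  size_t rxBytes = dma.readScatterGatherPoll(); // spin on the BD status bit

  // ... process rxBytes of received data and fill the TX block here ...

  dma.writeScatterGatherFast();                 // restart the TX transfer
  dma.writeScatterGatherPoll();                 // wait for write completion
  return rxBytes;
}
```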