diff --git a/etc/examples/nodes/fpga.conf b/etc/examples/nodes/fpga.conf
index e768cc183..697e2a8df 100644
--- a/etc/examples/nodes/fpga.conf
+++ b/etc/examples/nodes/fpga.conf
@@ -11,7 +11,7 @@ fpgas = {
         id = "10ee:7021",
         slot = "0000:88:00.0",
         do_reset = true,
-        ips = "../../../fpga/etc/vc707-xbar-pcie/vc707-xbar-pcie.json",
+        ips = "../../fpga/etc/vc707-xbar-pcie/vc707-xbar-pcie-dino-v2.json",
         polling =  false,
     }
 }
diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp
index e29cc093a..ce0c899da 100644
--- a/fpga/include/villas/fpga/ips/dma.hpp
+++ b/fpga/include/villas/fpga/ips/dma.hpp
@@ -10,10 +10,12 @@
 #pragma once
 
 #include <fmt/ostream.h>
+
 #include <villas/config.hpp>
 #include <villas/exceptions.hpp>
 #include <villas/fpga/node.hpp>
 #include <villas/memory.hpp>
+
 #include <xilinx/xaxidma.h>
 
 namespace villas {
@@ -49,6 +51,14 @@ public:
                               : writeCompleteSimple();
   }
 
+  bool readScatterGatherPrepare(const MemoryBlock &mem, size_t len);
+  bool readScatterGatherFast();
+  size_t readScatterGatherPoll(bool lock = true);
+
+  bool writeScatterGatherPrepare(const MemoryBlock &mem, size_t len);
+  bool writeScatterGatherFast();
+  size_t writeScatterGatherPoll(bool lock = true);
+
   Completion readComplete() {
     return hasScatterGather() ? readCompleteScatterGather()
                               : readCompleteSimple();
@@ -61,11 +71,11 @@ public:
 
   inline bool hasScatterGather() const { return xConfig.HasSg; }
 
-  const StreamVertex &getDefaultSlavePort() const {
+  const StreamVertex &getDefaultSlavePort() const override {
     return getSlavePort(s2mmPort);
   }
 
-  const StreamVertex &getDefaultMasterPort() const {
+  const StreamVertex &getDefaultMasterPort() const override {
     return getMasterPort(mm2sPort);
   }
 
@@ -80,7 +90,9 @@ public:
 private:
   bool writeScatterGather(const void *buf, size_t len);
   bool readScatterGather(void *buf, size_t len);
+  XAxiDma_Bd *writeScatterGatherSetupBd(const void *buf, size_t len);
   Completion writeCompleteScatterGather();
+  XAxiDma_Bd *readScatterGatherSetupBd(void *buf, size_t len);
   Completion readCompleteScatterGather();
 
   bool writeSimple(const void *buf, size_t len);
@@ -89,8 +101,8 @@ private:
   Completion readCompleteSimple();
 
   void setupScatterGather();
-  void setupScatterGatherRingRx();
-  void setupScatterGatherRingTx();
+  void setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr);
+  void setupScatterGatherRingTx(uintptr_t physAddr, uintptr_t virtAddr);
 
   static constexpr char registerMemory[] = "Reg";
 
@@ -103,7 +115,7 @@ private:
   // Optional Scatter-Gather interface to access descriptors
   static constexpr char sgInterface[] = "M_AXI_SG";
 
-  std::list<MemoryBlockName> getMemoryBlocks() const {
+  std::list<MemoryBlockName> getMemoryBlocks() const override {
     return {registerMemory};
   }
 
@@ -114,7 +126,8 @@ private:
 
   bool configDone = false;
   // use polling to wait for DMA completion or interrupts via efds
-  bool polling = false;
+  bool polling = false; // polling mode is significantly lower latency
+  bool cyclic = false;  // not fully implemented
   // Timeout after which the DMA controller issues in interrupt if no data has been received
   // Delay is 125 x <delay> x (clock period of SG clock). SG clock is 100 MHz by default.
   int delay = 0;
@@ -128,31 +141,32 @@ private:
 
   // When using SG: ringBdSize is the maximum number of BDs usable in the ring
   // Depending on alignment, the actual number of BDs usable can be smaller
-  static constexpr size_t requestedRingBdSize = 2048;
+  // We use a single BD for transfers, because this way we can achieve the best
+  // latency. The AXI read cache in the FPGA also only supports a single BD.
+  // TODO: We could make this configurable in the future.
+  static constexpr size_t requestedRingBdSize = 1;
   static constexpr size_t requestedRingBdSizeMemory =
       requestedRingBdSize * sizeof(XAxiDma_Bd);
-  uint32_t actualRingBdSize = XAxiDma_BdRingCntCalc(
-      XAXIDMA_BD_MINIMUM_ALIGNMENT, requestedRingBdSizeMemory);
-  std::shared_ptr<MemoryBlock> sgRingTx;
-  std::shared_ptr<MemoryBlock> sgRingRx;
+  uint32_t actualRingBdSize = 1;
+  std::shared_ptr<MemoryBlock> sgRing;
 };
 
 class DmaFactory : NodeFactory {
 
 public:
-  virtual std::string getName() const { return "dma"; }
+  virtual std::string getName() const override { return "dma"; }
 
-  virtual std::string getDescription() const {
+  virtual std::string getDescription() const override {
     return "Xilinx's AXI4 Direct Memory Access Controller";
   }
 
 private:
-  virtual Vlnv getCompatibleVlnv() const {
+  virtual Vlnv getCompatibleVlnv() const override {
     return Vlnv("xilinx.com:ip:axi_dma:");
   }
 
   // Create a concrete IP instance
-  Core *make() const { return new Dma; };
+  Core *make() const override { return new Dma; };
 
 protected:
   virtual void parse(Core &ip, json_t *json) override;
diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp
index 3442fdc18..872c0ae22 100644
--- a/fpga/lib/ips/dma.cpp
+++ b/fpga/lib/ips/dma.cpp
@@ -1,21 +1,23 @@
 /* DMA driver
  *
- * Author: Daniel Krebs <github@daniel-krebs.net>
  * Author: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
- * SPDX-FileCopyrightText: 2018 Institute for Automation of Complex Power Systems, RWTH Aachen University
+ * Author: Daniel Krebs <github@daniel-krebs.net>
+ * SPDX-FileCopyrightText: 2018-2024 Institute for Automation of Complex Power Systems, RWTH Aachen University
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #include <sstream>
 #include <string>
-
-#include <xilinx/xaxidma.h>
-
-#include <villas/memory.hpp>
+#include <sys/types.h>
 
 #include <villas/fpga/card.hpp>
 #include <villas/fpga/ips/dma.hpp>
 #include <villas/fpga/ips/intc.hpp>
+#include <villas/memory.hpp>
+
+#include <xilinx/xaxidma.h>
+#include <xilinx/xaxidma_bd.h>
+#include <xilinx/xaxidma_hw.h>
 
 // Max. size of a DMA transfer in simple mode
 #define FPGA_DMA_BOUNDARY 0x1000
@@ -48,10 +50,9 @@ bool Dma::init() {
   hwLock.unlock();
   // Map buffer descriptors
   if (hasScatterGather()) {
-    if (actualRingBdSize < 2 * readCoalesce ||
-        actualRingBdSize < 2 * writeCoalesce) {
+    if (actualRingBdSize < readCoalesce || actualRingBdSize < writeCoalesce) {
       throw RuntimeError(
-          "Ring buffer size is too small for coalesce value {} < 2*{}",
+          "Ring buffer size is too small for coalesce value {} < {}",
           actualRingBdSize, std::max(readCoalesce, writeCoalesce));
     }
     setupScatterGather();
@@ -67,11 +68,27 @@ bool Dma::init() {
 }
 
 void Dma::setupScatterGather() {
-  setupScatterGatherRingRx();
-  setupScatterGatherRingTx();
+  // A single host RAM block backs both rings: RX first, TX right behind it
+  sgRing = villas::HostRam::getAllocator().allocateBlock(
+      2 * requestedRingBdSizeMemory);
+  if (!card->mapMemoryBlock(sgRing)) {
+    throw RuntimeError("Memory not accessible by DMA");
+  }
+  auto &memoryManager = MemoryManager::get();
+  const auto ringSpace = sgRing->getAddrSpaceId();
+  // Address of the block as seen through the SG bus master interface ...
+  auto viaSg = memoryManager.getTranslation(busMasterInterfaces[sgInterface],
+                                            ringSpace);
+  const auto rxPhys = reinterpret_cast<uintptr_t>(viaSg.getLocalAddr(0));
+  // ... and as seen from this process
+  const auto rxVirt = reinterpret_cast<uintptr_t>(
+      memoryManager.getTranslationFromProcess(ringSpace).getLocalAddr(0));
+  setupScatterGatherRingRx(rxPhys, rxVirt);
+  setupScatterGatherRingTx(rxPhys + requestedRingBdSizeMemory,
+                           rxVirt + requestedRingBdSizeMemory);
 }
 
-void Dma::setupScatterGatherRingRx() {
+void Dma::setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr) {
   int ret;
 
   hwLock.lock();
@@ -83,20 +100,6 @@ void Dma::setupScatterGatherRingRx() {
   // Set delay and coalescing
   XAxiDma_BdRingSetCoalesce(rxRingPtr, readCoalesce, delay);
 
-  // Allocate and map space for BD ring in host RAM
-  auto &alloc = villas::HostRam::getAllocator();
-  sgRingRx = alloc.allocateBlock(requestedRingBdSizeMemory);
-
-  if (not card->mapMemoryBlock(sgRingRx))
-    throw RuntimeError("Memory not accessible by DMA");
-
-  auto &mm = MemoryManager::get();
-  auto trans = mm.getTranslation(busMasterInterfaces[sgInterface],
-                                 sgRingRx->getAddrSpaceId());
-  auto physAddr = reinterpret_cast<uintptr_t>(trans.getLocalAddr(0));
-  auto virtAddr = reinterpret_cast<uintptr_t>(
-      mm.getTranslationFromProcess(sgRingRx->getAddrSpaceId()).getLocalAddr(0));
-
   // Setup Rx BD space
   ret = XAxiDma_BdRingCreate(rxRingPtr, physAddr, virtAddr,
                              XAXIDMA_BD_MINIMUM_ALIGNMENT, actualRingBdSize);
@@ -111,8 +114,15 @@ void Dma::setupScatterGatherRingRx() {
   if (ret != XST_SUCCESS)
     throw RuntimeError("Failed to clone BD template: {}", ret);
 
+  if (cyclic) {
+    // Enable Cyclic DMA mode
+    XAxiDma_BdRingEnableCyclicDMA(rxRingPtr);
+    XAxiDma_SelectCyclicMode(&xDma, XAXIDMA_DEVICE_TO_DMA, 1);
+  }
   // Enable completion interrupt
-  XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DEVICE_TO_DMA);
+  if (!polling) {
+    XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DEVICE_TO_DMA);
+  }
   // Start the RX channel
   ret = XAxiDma_BdRingStart(rxRingPtr);
   if (ret != XST_SUCCESS)
@@ -121,7 +131,7 @@ void Dma::setupScatterGatherRingRx() {
   hwLock.unlock();
 }
 
-void Dma::setupScatterGatherRingTx() {
+void Dma::setupScatterGatherRingTx(uintptr_t physAddr, uintptr_t virtAddr) {
   int ret;
 
   hwLock.lock();
@@ -133,20 +143,6 @@ void Dma::setupScatterGatherRingTx() {
   // Set TX delay and coalesce
   XAxiDma_BdRingSetCoalesce(txRingPtr, writeCoalesce, delay);
 
-  // Allocate and map space for BD ring in host RAM
-  auto &alloc = villas::HostRam::getAllocator();
-  sgRingTx = alloc.allocateBlock(requestedRingBdSizeMemory);
-
-  if (not card->mapMemoryBlock(sgRingTx))
-    throw RuntimeError("Memory not accessible by DMA");
-
-  auto &mm = MemoryManager::get();
-  auto trans = mm.getTranslation(busMasterInterfaces[sgInterface],
-                                 sgRingTx->getAddrSpaceId());
-  auto physAddr = reinterpret_cast<uintptr_t>(trans.getLocalAddr(0));
-  auto virtAddr = reinterpret_cast<uintptr_t>(
-      mm.getTranslationFromProcess(sgRingTx->getAddrSpaceId()).getLocalAddr(0));
-
   // Setup TxBD space
   ret = XAxiDma_BdRingCreate(txRingPtr, physAddr, virtAddr,
                              XAXIDMA_BD_MINIMUM_ALIGNMENT, actualRingBdSize);
@@ -162,7 +158,9 @@ void Dma::setupScatterGatherRingTx() {
     throw RuntimeError("Failed to clone TX ring BD: {}", ret);
 
   // Enable completion interrupt
-  XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DMA_TO_DEVICE);
+  if (!polling) {
+    XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DMA_TO_DEVICE);
+  }
   // Start the TX channel
   ret = XAxiDma_BdRingStart(txRingPtr);
   if (ret != XST_SUCCESS)
@@ -213,12 +211,9 @@ Dma::~Dma() {
       free(rxRingPtr->CyclicBd);
       rxRingPtr->CyclicBd = nullptr;
     }
-    // unampe SG memory Blocks
-    if (sgRingTx) {
-      card->unmapMemoryBlock(*sgRingTx);
-    }
-    if (sgRingRx) {
-      card->unmapMemoryBlock(*sgRingRx);
+    // Unmap SG memory Blocks
+    if (sgRing) {
+      card->unmapMemoryBlock(*sgRing);
     }
   }
   Dma::reset();
@@ -288,12 +283,58 @@ bool Dma::read(const MemoryBlock &mem, size_t len) {
                             : readSimple(buf, len);
 }
 
-//Write a single message
-bool Dma::writeScatterGather(const void *buf, size_t len) {
-  // buf is address from view of DMA controller
-
-  int ret = XST_FAILURE;
+// Re-arm the BD at the head of the TX ring, bypassing BdRingFree/Alloc/ToHw.
+// Always returns true; throws RuntimeError if the TX ring is missing.
+bool Dma::writeScatterGatherFast() {
   hwLock.lock();
+  auto *txRing = XAxiDma_GetTxRing(&xDma);
+  if (txRing == nullptr) {
+    hwLock.unlock();
+    throw RuntimeError("TxRing was null.");
+  }
+  XAxiDma_Bd *CurBdPtr = txRing->HwHead;
+  // Clear the completion bit that writeScatterGatherPoll() spins on
+  uint32_t BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
+  BdSts &= ~XAXIDMA_BD_STS_COMPLETE_MASK;
+  XAxiDma_BdWrite(CurBdPtr, XAXIDMA_BD_STS_OFFSET, BdSts);
+
+  // Write the tail BD's DMA-side address to TDESC to start the transfer
+  uintptr_t tdesc = ((uintptr_t)txRing->HwTail +
+                     (txRing->FirstBdPhysAddr - txRing->FirstBdAddr)) &
+                    XAXIDMA_DESC_LSB_MASK;
+  XAxiDma_WriteReg(txRing->ChanBase, XAXIDMA_TDESC_OFFSET, tdesc);
+  hwLock.unlock();
+  return true;
+}
+
+// Look up the address of mem on the MM2S bus master interface and set up the
+// TX buffer descriptor for len bytes through writeScatterGatherSetupBd().
+// Returns false when no descriptor was set up.
+bool Dma::writeScatterGatherPrepare(const MemoryBlock &mem, size_t len) {
+  // User has to make sure that memory is accessible, otherwise this will throw
+  auto toDevice = MemoryManager::get().getTranslation(
+      busMasterInterfaces[mm2sInterface], mem.getAddrSpaceId());
+  void *const dmaAddr = reinterpret_cast<void *>(toDevice.getLocalAddr(0));
+  if (!dmaAddr) {
+    throw RuntimeError("Buffer was null");
+  }
+
+  // The helper releases hwLock before throwing, so it must be held here
+  hwLock.lock();
+  XAxiDma_Bd *const descriptor = writeScatterGatherSetupBd(dmaAddr, len);
+  hwLock.unlock();
+
+  return descriptor != nullptr;
+}
+
+XAxiDma_Bd *Dma::writeScatterGatherSetupBd(const void *buf, size_t len) {
+  uint32_t ret = XST_FAILURE;
+  if (len == 0)
+    return nullptr;
+
+  if (len > FPGA_DMA_BOUNDARY)
+    return nullptr;
+
   auto *txRing = XAxiDma_GetTxRing(&xDma);
   if (txRing == nullptr) {
     hwLock.unlock();
@@ -325,7 +366,22 @@ bool Dma::writeScatterGather(const void *buf, size_t len) {
 
   // TODO: Check if we really need this
   XAxiDma_BdSetId(bd, (uintptr_t)buf);
+  return bd;
+}
 
+// Write a single message
+bool Dma::writeScatterGather(const void *buf, size_t len) {
+  // buf is address from view of DMA controller
+
+  int ret = XST_FAILURE;
+  hwLock.lock();
+  auto *txRing = XAxiDma_GetTxRing(&xDma);
+  if (txRing == nullptr) {
+    hwLock.unlock();
+    throw RuntimeError("TxRing was null.");
+  }
+
+  XAxiDma_Bd *bd = writeScatterGatherSetupBd(buf, len);
   // Give control of BD to HW. We should not access it until transfer is finished.
   // Failure could also indicate that EOF is not set on last Bd
   ret = XAxiDma_BdRingToHw(txRing, 1, bd);
@@ -339,20 +395,64 @@ bool Dma::writeScatterGather(const void *buf, size_t len) {
   return true;
 }
 
-bool Dma::readScatterGather(void *buf, size_t len) {
-  int ret = XST_FAILURE;
-
-  if (len < readCoalesce * readMsgSize) {
-    throw RuntimeError(
-        "Read size is smaller than readCoalesce*msgSize. Cannot setup BDs.");
-  }
-
+// Re-arm the BD at the head of the RX ring, bypassing BdRingFree/Alloc/ToHw
+bool Dma::readScatterGatherFast() {
   hwLock.lock();
   auto *rxRing = XAxiDma_GetRxRing(&xDma);
   if (rxRing == nullptr) {
     hwLock.unlock();
     throw RuntimeError("RxRing was null.");
   }
+  XAxiDma_Bd *CurBdPtr = rxRing->HwHead;
+  // Fetch the BD's current status word (a single read, no polling here)
+  uint32_t BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
+
+  // Clear the completion bit that readScatterGatherPoll() spins on
+  BdSts &= ~XAXIDMA_BD_STS_COMPLETE_MASK;
+  XAxiDma_BdWrite(CurBdPtr, XAXIDMA_BD_STS_OFFSET, BdSts);
+  // Write the tail BD's DMA-side address to TDESC to start the transfer
+  uintptr_t tdesc = ((uintptr_t)rxRing->HwTail +
+                     (rxRing->FirstBdPhysAddr - rxRing->FirstBdAddr)) &
+                    XAXIDMA_DESC_LSB_MASK;
+  XAxiDma_WriteReg(rxRing->ChanBase, XAXIDMA_TDESC_OFFSET, tdesc);
+
+  hwLock.unlock();
+  return true;
+}
+
+// Look up the address of mem on the S2MM bus master interface and set up the
+// RX buffer descriptor(s) for len bytes through readScatterGatherSetupBd().
+// Returns false when no descriptor was set up.
+bool Dma::readScatterGatherPrepare(const MemoryBlock &mem, size_t len) {
+  // User has to make sure that memory is accessible, otherwise this will throw
+  auto fromDevice = MemoryManager::get().getTranslation(
+      busMasterInterfaces[s2mmInterface], mem.getAddrSpaceId());
+  void *const dmaAddr = reinterpret_cast<void *>(fromDevice.getLocalAddr(0));
+  if (!dmaAddr) {
+    throw RuntimeError("Buffer was null");
+  }
+
+  // The helper releases hwLock before throwing, so it must be held here
+  hwLock.lock();
+  XAxiDma_Bd *const descriptor = readScatterGatherSetupBd(dmaAddr, len);
+  hwLock.unlock();
+
+  return descriptor != nullptr;
+}
+
+XAxiDma_Bd *Dma::readScatterGatherSetupBd(void *buf, size_t len) {
+  uint32_t ret = XST_FAILURE;
+  if (len == 0)
+    return nullptr;
+
+  if (len > FPGA_DMA_BOUNDARY)
+    return nullptr;
+
+  auto *rxRing = XAxiDma_GetRxRing(&xDma);
+  if (rxRing == nullptr) {
+    hwLock.unlock();
+    throw RuntimeError("RxRing was null.");
+  }
 
   XAxiDma_Bd *bd;
   ret = XAxiDma_BdRingAlloc(rxRing, readCoalesce, &bd);
@@ -388,6 +488,25 @@ bool Dma::readScatterGather(void *buf, size_t len) {
     curBuf += readMsgSize;
     curBd = (XAxiDma_Bd *)XAxiDma_BdRingNext(rxRing, curBd);
   }
+  return bd;
+}
+
+bool Dma::readScatterGather(void *buf, size_t len) {
+  uint32_t ret = XST_FAILURE;
+
+  if (len < readCoalesce * readMsgSize) {
+    throw RuntimeError(
+        "Read size is smaller than readCoalesce*msgSize. Cannot setup BDs.");
+  }
+
+  hwLock.lock();
+  auto *rxRing = XAxiDma_GetRxRing(&xDma);
+  if (rxRing == nullptr) {
+    hwLock.unlock();
+    throw RuntimeError("RxRing was null.");
+  }
+
+  XAxiDma_Bd *bd = readScatterGatherSetupBd(buf, len);
 
   ret = XAxiDma_BdRingToHw(rxRing, readCoalesce, bd);
   if (ret != XST_SUCCESS) {
@@ -399,30 +518,36 @@ bool Dma::readScatterGather(void *buf, size_t len) {
   return true;
 }
 
+// Spin until the BD at the head of the TX ring is flagged complete and return
+// the length recorded in it. Pass lock = false if hwLock is already held.
+size_t Dma::writeScatterGatherPoll(bool lock) {
+  if (lock)
+    hwLock.lock();
+
+  auto *ring = XAxiDma_GetTxRing(&xDma);
+  XAxiDma_Bd *const headBd = ring->HwHead;
+  // Poll the status word inside the BD to avoid accessing PCIe address space
+  volatile uint32_t status;
+  do {
+    status = XAxiDma_ReadReg((UINTPTR)headBd, XAXIDMA_BD_STS_OFFSET);
+  } while ((status & XAXIDMA_BD_STS_COMPLETE_MASK) == 0);
+
+  if (lock)
+    hwLock.unlock();
+  return XAxiDma_BdGetActualLength(headBd, XAXIDMA_MCHAN_MAX_TRANSFER_LEN);
+}
+
 Dma::Completion Dma::writeCompleteScatterGather() {
   Completion c;
   XAxiDma_Bd *bd = nullptr, *curBd;
   auto txRing = XAxiDma_GetTxRing(&xDma);
-  int ret = XST_FAILURE;
+  uint32_t ret = XST_FAILURE;
   static size_t errcnt = 32;
 
   uint32_t irqStatus = 0;
   if (polling) {
     hwLock.lock();
-    XAxiDma_Bd *CurBdPtr = txRing->HwHead;
-    volatile uint32_t BdSts;
-    // Poll BD status to avoid accessing PCIe address space
-    do {
-      BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
-    } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
-    // At this point, we know that the transmission is complete, but we haven't accessed the
-    // PCIe address space, yet. The subsequent DMA Controller management can be done in a
-    // separate thread to keep latencies in this thread extremly low. We know that we have
-    // received one BD.
-    do {
-      // This takes 1.5 us
-      irqStatus = XAxiDma_BdRingGetIrq(txRing);
-    } while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK));
+    writeScatterGatherPoll(false);
   } else {
     c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt(
         irqs[mm2sInterrupt].num);
@@ -441,8 +566,8 @@ Dma::Completion Dma::writeCompleteScatterGather() {
   // Acknowledge the interrupt
   if (!polling) {
     irqStatus = XAxiDma_BdRingGetIrq(txRing);
+    XAxiDma_BdRingAckIrq(txRing, irqStatus);
   }
-  XAxiDma_BdRingAckIrq(txRing, irqStatus);
 
   if (c.bds == 0) {
     c.bytes = 0;
@@ -461,7 +586,7 @@ Dma::Completion Dma::writeCompleteScatterGather() {
     if ((ret & XAXIDMA_BD_STS_ALL_ERR_MASK) ||
         (!(ret & XAXIDMA_BD_STS_COMPLETE_MASK))) {
       hwLock.unlock();
-      throw RuntimeError("Bd Status register shows error: {}", ret);
+      throw RuntimeError("Write: Bd Status register shows error: {:#x}", ret);
     }
 
     c.bytes += XAxiDma_BdGetLength(bd, txRing->MaxTransferLen);
@@ -478,6 +603,25 @@ Dma::Completion Dma::writeCompleteScatterGather() {
   return c;
 }
 
+// Spin until the BD at the head of the RX ring is flagged complete and return
+// the length recorded in it. Pass lock = false if hwLock is already held.
+size_t Dma::readScatterGatherPoll(bool lock) {
+  if (lock)
+    hwLock.lock();
+
+  auto *ring = XAxiDma_GetRxRing(&xDma);
+  XAxiDma_Bd *const headBd = ring->HwHead;
+  // Poll the status word inside the BD to avoid accessing PCIe address space
+  volatile uint32_t status;
+  do {
+    status = XAxiDma_ReadReg((UINTPTR)headBd, XAXIDMA_BD_STS_OFFSET);
+  } while ((status & XAXIDMA_BD_STS_COMPLETE_MASK) == 0);
+
+  if (lock)
+    hwLock.unlock();
+  return XAxiDma_BdGetActualLength(headBd, XAXIDMA_MCHAN_MAX_TRANSFER_LEN);
+}
+
 Dma::Completion Dma::readCompleteScatterGather() {
   Completion c;
   XAxiDma_Bd *bd = nullptr, *curBd;
@@ -489,20 +633,7 @@ Dma::Completion Dma::readCompleteScatterGather() {
   uint32_t irqStatus = 0;
   if (polling) {
     hwLock.lock();
-    XAxiDma_Bd *CurBdPtr = rxRing->HwHead;
-    volatile uint32_t BdSts;
-    // Poll BD status to avoid accessing PCIe address space
-    do {
-      BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
-    } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
-    // At this point, we know that the transmission is complete, but we haven't accessed the
-    // PCIe address space, yet. The subsequent DMA Controller management can be done in a
-    // separate thread to keep latencies in this thread extremly low. We know that we have
-    // received one BD.
-    do {
-      // This takes 1.5 us
-      irqStatus = XAxiDma_BdRingGetIrq(rxRing);
-    } while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK));
+    readScatterGatherPoll(false);
     intrs = 1;
   } else {
     intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt(
@@ -521,18 +652,17 @@ Dma::Completion Dma::readCompleteScatterGather() {
     c.interrupts = 0;
     return c;
   } else {
-    hwLock.unlock();
     c.interrupts = intrs;
   }
   if (!polling) {
     irqStatus = XAxiDma_BdRingGetIrq(rxRing);
+    XAxiDma_BdRingAckIrq(rxRing, irqStatus);
+    if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) {
+      logger->error("Expected IOC interrupt but IRQ status is: {:#x}",
+                    irqStatus);
+      return c;
+    }
   }
-  XAxiDma_BdRingAckIrq(rxRing, irqStatus);
-  if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) {
-    logger->error("Expected IOC interrupt but IRQ status is: {:#x}", irqStatus);
-    return c;
-  }
-
   // Wait until the data has been received by the RX channel.
   if ((c.bds = XAxiDma_BdRingFromHw(rxRing, readCoalesce, &bd)) <
       readCoalesce) {
@@ -564,7 +694,7 @@ Dma::Completion Dma::readCompleteScatterGather() {
     if ((ret & XAXIDMA_BD_STS_ALL_ERR_MASK) ||
         (!(ret & XAXIDMA_BD_STS_COMPLETE_MASK))) {
       hwLock.unlock();
-      throw RuntimeError("Bd Status register shows error: {}", ret);
+      throw RuntimeError("Read: Bd Status register shows error: {}", ret);
     }
 
     c.bytes += XAxiDma_BdGetActualLength(bd, rxRing->MaxTransferLen);
diff --git a/fpga/lib/ips/register.cpp b/fpga/lib/ips/register.cpp
index 83bb134c0..9310a1183 100644
--- a/fpga/lib/ips/register.cpp
+++ b/fpga/lib/ips/register.cpp
@@ -43,9 +43,12 @@ bool Register::check() {
   }
 
   // This is Dino specific for now - we should possibly move this to Dino in the future
-  setRegister(0, static_cast<uint32_t>(1000)); // set Dino to a rate of 20 kHz
-  setRegister(1, -0.001615254F);
-  setRegister(2, 10.8061F);
+  constexpr double dinoClk = 25e6;    // Dino is clocked with 25 MHz
+  constexpr double sampleRate = 20e3; // 20 kHz, i.e. a timestep of 50 us
+  constexpr uint32_t dinoTimerVal = static_cast<uint32_t>(dinoClk / sampleRate);
+  setRegister(0, dinoTimerVal); // Timer value for generating ADC trigger signal
+  setRegister(1, -0.001615254F); // Scale factor for ADC value
+  setRegister(2, 10.8061F);      // Offset for ADC value
   uint32_t rate = getRegister(0);
   float scale = getRegisterFloat(1);
   float offset = getRegisterFloat(2);
diff --git a/include/villas/nodes/fpga.hpp b/include/villas/nodes/fpga.hpp
index da9bdebd2..a124efca3 100644
--- a/include/villas/nodes/fpga.hpp
+++ b/include/villas/nodes/fpga.hpp
@@ -9,6 +9,7 @@
 
 #pragma once
 
+#include <thread>
 #include <villas/format.hpp>
 #include <villas/node.hpp>
 #include <villas/node/config.hpp>
@@ -31,15 +32,31 @@ protected:
   std::string cardName;
   std::list<std::string> connectStrings;
 
+  // This setting improves latency by removing various checks.
+  // Use with caution! Requires read cache in FPGA design!
+  // The common use case in VILLASfpga is that we have exactly
+  // one write for every read and the number of exchanged signals
+  // does not change. If this is the case, we can reuse the buffer
+  // descriptors during reads and writes, thus avoiding freeing,
+  // reallocating and setting them up.
+  // We set up the descriptors in start, and in write or read,
+  // we only reset the complete bit in the buffer descriptor and
+  // write to the tdesc register to start the DMA transfer.
+  // Improves read/write latency by approx. 40%.
+  bool lowLatencyMode;
+
   // State
   std::shared_ptr<fpga::Card> card;
   std::shared_ptr<villas::fpga::ip::Dma> dma;
-  std::shared_ptr<villas::MemoryBlock> blockRx[2];
+  std::shared_ptr<villas::MemoryBlock> blockRx;
   std::shared_ptr<villas::MemoryBlock> blockTx;
 
   // Non-public methods
+  virtual int fastRead(Sample *smps[], unsigned cnt);
+  virtual int slowRead(Sample *smps[], unsigned cnt);
   virtual int _read(Sample *smps[], unsigned cnt) override;
-
+  virtual int fastWrite(Sample *smps[], unsigned cnt);
+  virtual int slowWrite(Sample *smps[], unsigned cnt);
   virtual int _write(Sample *smps[], unsigned cnt) override;
 
 public:
@@ -55,6 +72,8 @@ public:
 
   virtual int start() override;
 
+  virtual int stop() override;
+
   virtual std::vector<int> getPollFDs() override;
 
   virtual const std::string &getDetails() override;
diff --git a/lib/nodes/fpga.cpp b/lib/nodes/fpga.cpp
index 8d0cd0975..c365436e9 100644
--- a/lib/nodes/fpga.cpp
+++ b/lib/nodes/fpga.cpp
@@ -7,6 +7,7 @@
 
 #include <memory>
 #include <string>
+#include <unistd.h>
 #include <vector>
 
 #include <jansson.h>
@@ -32,8 +33,8 @@ static std::list<std::shared_ptr<fpga::Card>> cards;
 static std::shared_ptr<kernel::vfio::Container> vfioContainer;
 
 FpgaNode::FpgaNode(const uuid_t &id, const std::string &name)
-    : Node(id, name), cardName(""), card(nullptr), dma(), blockRx(), blockTx() {
-}
+    : Node(id, name), cardName(""), connectStrings(), lowLatencyMode(false),
+      card(nullptr), dma(), blockRx(), blockTx() {}
 
 FpgaNode::~FpgaNode() {}
 
@@ -75,14 +76,12 @@ int FpgaNode::prepare() {
 
   auto &alloc = HostRam::getAllocator();
 
-  blockRx[0] = alloc.allocateBlock(0x200 * sizeof(float));
-  blockRx[1] = alloc.allocateBlock(0x200 * sizeof(float));
+  blockRx = alloc.allocateBlock(0x200 * sizeof(float));
   blockTx = alloc.allocateBlock(0x200 * sizeof(float));
-  villas::MemoryAccessor<float> memRx[] = {*(blockRx[0]), *(blockRx[1])};
+  villas::MemoryAccessor<float> memRx = *blockRx;
   villas::MemoryAccessor<float> memTx = *blockTx;
 
-  dma->makeAccesibleFromVA(blockRx[0]);
-  dma->makeAccesibleFromVA(blockRx[1]);
+  dma->makeAccesibleFromVA(blockRx);
   dma->makeAccesibleFromVA(blockTx);
 
   MemoryManager::get().printGraph();
@@ -90,6 +89,8 @@ int FpgaNode::prepare() {
   return Node::prepare();
 }
 
+int FpgaNode::stop() { return Node::stop(); }
+
 int FpgaNode::parse(json_t *json) {
   int ret = Node::parse(json);
   if (ret) {
@@ -105,8 +106,13 @@ int FpgaNode::parse(json_t *json) {
     vfioContainer = std::make_shared<kernel::vfio::Container>();
   }
 
-  ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o}", "card", &jsonCard,
-                       "connect", &jsonConnectStrings);
+  // jansson's 'b' format stores into a C int, so unpack into a temporary
+  // instead of handing it the address of a bool. One spec per key/target pair.
+  int lowLatency = lowLatencyMode;
+  ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o, s?: b }", "card",
+                       &jsonCard, "connect", &jsonConnectStrings,
+                       "lowLatencyMode", &lowLatency);
+  lowLatencyMode = lowLatency != 0;
   if (ret) {
     throw ConfigError(json, err, "node-config-fpga",
                       "Failed to parse configuration of node {}",
@@ -158,7 +160,8 @@ const std::string &FpgaNode::getDetails() {
     std::copy(connectStrings.begin(), connectStrings.end(),
               std::ostream_iterator<std::string>(imploded, delim));
 
-    details = fmt::format("fpga={}, connect={}", name, imploded.str());
+    details = fmt::format("fpga={}, connect={}, lowLatencyMode={}", name,
+                          imploded.str(), lowLatencyMode);
   }
 
   return details;
@@ -167,19 +170,99 @@ const std::string &FpgaNode::getDetails() {
 int FpgaNode::check() { return 0; }
 
 int FpgaNode::start() {
-  // enque first read
-  // dma->read(*(blockRx[0]), blockRx[0]->getSize());
+  if (getInputSignalsMaxCount() * sizeof(float) > blockRx->getSize()) {
+    logger->error("Input signals exceed block size.");
+    throw villas::RuntimeError("Input signals exceed block size.");
+  }
+  // Low-latency mode sets up the BDs once; fastRead/fastWrite only re-arm them
+  if (lowLatencyMode) {
+    if (!dma->readScatterGatherPrepare(*blockRx, blockRx->getSize()))
+      throw villas::RuntimeError("Failed to prepare read buffer descriptor.");
+    const auto txBytes = getInputSignalsMaxCount() * sizeof(float);
+    if (txBytes == 0)
+      logger->warn("No input signals defined. Not preparing write buffer - "
+                   "writes will not work.");
+    else if (!dma->writeScatterGatherPrepare(*blockTx, txBytes))
+      throw villas::RuntimeError("Failed to prepare write buffer descriptor.");
+  }
   return Node::start();
 }
 
+// We cannot modify the BD here, so writes are fixed length.
+// If fastWrite receives less signals than expected, the previous data
+// will be reused for the remaining signals
+int FpgaNode::fastWrite(Sample *smps[], unsigned cnt) {
+  // Check the arguments before smps[0] is dereferenced
+  assert(cnt == 1 && smps != nullptr && smps[0] != nullptr);
+  Sample *smp = smps[0];
+  auto mem = MemoryAccessor<uint32_t>(*blockTx);
+  // Never write past the end of the Tx block, whatever the sample claims
+  const size_t maxSignals = blockTx->getSize() / sizeof(uint32_t);
+  for (unsigned i = 0; i < MIN(smp->length, maxSignals); i++) {
+    if (smp->signals->getByIndex(i)->type == SignalType::FLOAT) {
+      // Clamp to [-10, 10] and map that range linearly onto 0 .. 0xFFFF
+      float scaled = smp->data[i].f;
+      if (scaled > 10.) {
+        scaled = 10.;
+      } else if (scaled < -10.) {
+        scaled = -10.;
+      }
+      mem[i] = (scaled + 10.) * ((float)0xFFFF / 20.);
+    } else {
+      mem[i] = smp->data[i].i;
+    }
+  }
+
+  dma->writeScatterGatherFast();
+  auto written = dma->writeScatterGatherPoll() /
+                 sizeof(float); // The number of samples written
+
+  if (written != smp->length) {
+    logger->warn("Wrote {} samples, but {} were expected", written,
+                 smp->length);
+  }
+
+  return 1;
+}
+
+// Because we cannot modify the BD here, reads are fixed length.
+// However, if we receive less data than expected, we will return only
+// what we have received. fastRead is thus capable of partial reads.
+int FpgaNode::fastRead(Sample *smps[], unsigned cnt) {
+  assert(cnt == 1 && smps != nullptr && smps[0] != nullptr);
+  Sample *smp = smps[0];
+  auto mem = MemoryAccessor<float>(*blockRx);
+
+  smp->flags = (int)SampleFlags::HAS_DATA;
+  smp->signals = in.signals;
+
+  // Re-arm the prepared BD and spin until the DMA marks it complete
+  dma->readScatterGatherFast();
+  const size_t received = dma->readScatterGatherPoll(true) / sizeof(float);
+
+  // Copy no more values than were received or than the sample can hold
+  smp->length = MIN(received, smp->capacity);
+  for (unsigned i = 0; i < smp->length; i++) {
+    smp->data[i].f = static_cast<double>(mem[i]);
+  }
+  return 1;
+}
+
 int FpgaNode::_read(Sample *smps[], unsigned cnt) {
-  static size_t cur = 0, next = 0;
+  // Low-latency mode re-arms prepared BDs; otherwise use the regular DMA path
+  if (!lowLatencyMode) {
+    return slowRead(smps, cnt);
+  }
+  return fastRead(smps, cnt);
+}
+
+int FpgaNode::slowRead(Sample *smps[], unsigned cnt) {
   unsigned read;
   Sample *smp = smps[0];
 
   assert(cnt == 1);
 
-  dma->read(*(blockRx[next]), blockRx[next]->getSize()); // TODO: calc size
+  dma->read(*blockRx, blockRx->getSize());
   auto c = dma->readComplete();
 
   read = c.bytes / sizeof(float);
@@ -188,7 +271,7 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) {
     logger->warn("Missed {} interrupts", c.interrupts - 1);
   }
 
-  auto mem = MemoryAccessor<float>(*(blockRx[cur]));
+  auto mem = MemoryAccessor<float>(*blockRx);
 
   smp->length = 0;
   for (unsigned i = 0; i < MIN(read, smp->capacity); i++) {
@@ -198,14 +281,20 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) {
   smp->flags = (int)SampleFlags::HAS_DATA;
 
   smp->signals = in.signals;
-  //cur = next;
-  //next = (next + 1) % (sizeof(blockRx) / sizeof(blockRx[0]));
 
   return 1;
 }
 
 int FpgaNode::_write(Sample *smps[], unsigned cnt) {
-  unsigned int written;
+  // Low-latency mode re-arms prepared BDs; otherwise use the regular DMA path
+  if (!lowLatencyMode) {
+    return slowWrite(smps, cnt);
+  }
+  return fastWrite(smps, cnt);
+}
+
+int FpgaNode::slowWrite(Sample *smps[], unsigned cnt) {
+  // unsigned int written;
   Sample *smp = smps[0];
 
   assert(cnt == 1 && smps != nullptr && smps[0] != nullptr);
@@ -224,11 +313,11 @@ int FpgaNode::_write(Sample *smps[], unsigned cnt) {
   }
 
   bool state = dma->write(*blockTx, smp->length * sizeof(float));
-  if (!state)
+  if (!state) {
     return -1;
-
-  written = dma->writeComplete().bytes /
-            sizeof(float); // The number of samples written
+  }
+  auto written = dma->writeComplete().bytes /
+                 sizeof(float); // The number of samples written
 
   if (written != smp->length) {
     logger->warn("Wrote {} samples, but {} were expected", written,