fpga: DMA: poll BD instead of hardware register

Polling the hardware register is slow (>1 us); polling the BD in RAM is faster. This is a first implementation which only polls the first BD that is active. For that reason, this commit also removes the second read in nodes/fpga, which is not really useful anyway. Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
2025-03-30 00:00:11 +01:00 · 2024-03-14 11:53:19 +01:00 · 2024-03-14 11:53:19 +01:00 · d9b3bdb0de
commit d9b3bdb0de
parent 2f0a10c49b
2 changed files with 66 additions and 18 deletions
--- a/fpga/lib/ips/dma.cpp
+++ b/fpga/lib/ips/dma.cpp
@ -57,11 +57,12 @@ bool Dma::init() {
    setupScatterGather();
  }

-  irqs[mm2sInterrupt].irqController->enableInterrupt(irqs[mm2sInterrupt],
-                                                     polling);
-  irqs[s2mmInterrupt].irqController->enableInterrupt(irqs[s2mmInterrupt],
-                                                     polling);
-
+  if (!polling) {
+    irqs[mm2sInterrupt].irqController->enableInterrupt(irqs[mm2sInterrupt],
+                                                       polling);
+    irqs[s2mmInterrupt].irqController->enableInterrupt(irqs[s2mmInterrupt],
+                                                       polling);
+  }
  return true;
 }

@ -341,9 +342,10 @@ bool Dma::writeScatterGather(const void *buf, size_t len) {
 bool Dma::readScatterGather(void *buf, size_t len) {
  int ret = XST_FAILURE;

-  if (len < readCoalesce * readMsgSize)
+  if (len < readCoalesce * readMsgSize) {
    throw RuntimeError(
        "Read size is smaller than readCoalesce*msgSize. Cannot setup BDs.");
+  }

  hwLock.lock();
  auto *rxRing = XAxiDma_GetRxRing(&xDma);
@ -404,10 +406,29 @@ Dma::Completion Dma::writeCompleteScatterGather() {
  int ret = XST_FAILURE;
  static size_t errcnt = 32;

-  c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt(
-      irqs[mm2sInterrupt].num);
+  uint32_t irqStatus = 0;
+  if (polling) {
+    hwLock.lock();
+    XAxiDma_Bd *CurBdPtr = txRing->HwHead;
+    volatile uint32_t BdSts;
+    // Poll BD status to avoid accessing PCIe address space
+    do {
+      BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
+    } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
+    // At this point, we know that the transmission is complete, but we haven't accessed the
+    // PCIe address space yet. The subsequent DMA controller management can be done in a
+    // separate thread to keep latencies in this thread extremely low. We know that we have
+    // received one BD.
+    do {
+      // This takes 1.5 us
+      irqStatus = XAxiDma_BdRingGetIrq(txRing);
+    } while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK));
+  } else {
+    c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt(
+        irqs[mm2sInterrupt].num);
+    hwLock.lock();
+  }

-  hwLock.lock();
  if ((c.bds = XAxiDma_BdRingFromHw(txRing, writeCoalesce, &bd)) <
      writeCoalesce) {
    logger->warn("Send partial batch of {}/{} BDs.", c.bds, writeCoalesce);
@ -418,7 +439,9 @@ Dma::Completion Dma::writeCompleteScatterGather() {
  }

  // Acknowledge the interrupt
-  auto irqStatus = XAxiDma_BdRingGetIrq(txRing);
+  if (!polling) {
+    irqStatus = XAxiDma_BdRingGetIrq(txRing);
+  }
  XAxiDma_BdRingAckIrq(txRing, irqStatus);

  if (c.bds == 0) {
@ -462,8 +485,30 @@ Dma::Completion Dma::readCompleteScatterGather() {
  int ret = XST_FAILURE;
  static size_t errcnt = 32;

-  ssize_t intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt(
-      irqs[s2mmInterrupt].num);
+  ssize_t intrs = 0;
+  uint32_t irqStatus = 0;
+  if (polling) {
+    hwLock.lock();
+    XAxiDma_Bd *CurBdPtr = rxRing->HwHead;
+    volatile uint32_t BdSts;
+    // Poll BD status to avoid accessing PCIe address space
+    do {
+      BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
+    } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
+    // At this point, we know that the transmission is complete, but we haven't accessed the
+    // PCIe address space yet. The subsequent DMA controller management can be done in a
+    // separate thread to keep latencies in this thread extremely low. We know that we have
+    // received one BD.
+    do {
+      // This takes 1.5 us
+      irqStatus = XAxiDma_BdRingGetIrq(rxRing);
+    } while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK));
+    intrs = 1;
+  } else {
+    intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt(
+        irqs[s2mmInterrupt].num);
+    hwLock.lock();
+  }

  if (intrs < 0) {
    logger->warn("Interrupt error or timeout: {}", intrs);
@ -471,14 +516,17 @@ Dma::Completion Dma::readCompleteScatterGather() {
    // Free all RX BDs for future transmission.
    int bds = XAxiDma_BdRingFromHw(rxRing, XAXIDMA_ALL_BDS, &bd);
    XAxiDma_BdRingFree(rxRing, bds, bd);
+    hwLock.unlock();

    c.interrupts = 0;
    return c;
  } else {
+    hwLock.unlock();
    c.interrupts = intrs;
  }
-  hwLock.lock();
-  auto irqStatus = XAxiDma_BdRingGetIrq(rxRing);
+  if (!polling) {
+    irqStatus = XAxiDma_BdRingGetIrq(rxRing);
+  }
  XAxiDma_BdRingAckIrq(rxRing, irqStatus);
  if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) {
    logger->error("Expected IOC interrupt but IRQ status is: {:#x}", irqStatus);
--- a/lib/nodes/fpga.cpp
+++ b/lib/nodes/fpga.cpp
@ -169,12 +169,12 @@ int FpgaNode::check() { return 0; }

 int FpgaNode::start() {
  // enque first read
-  dma->read(*(blockRx[0]), blockRx[0]->getSize());
+  // dma->read(*(blockRx[0]), blockRx[0]->getSize());
  return Node::start();
 }

 int FpgaNode::_read(Sample *smps[], unsigned cnt) {
-  static size_t cur = 0, next = 1;
+  static size_t cur = 0, next = 0;
  unsigned read;
  Sample *smp = smps[0];

@ -199,8 +199,8 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) {
  smp->flags = (int)SampleFlags::HAS_DATA;

  smp->signals = in.signals;
-  cur = next;
-  next = (next + 1) % (sizeof(blockRx) / sizeof(blockRx[0]));
+  //cur = next;
+  //next = (next + 1) % (sizeof(blockRx) / sizeof(blockRx[0]));

  return 1;
 }