diff --git a/etc/examples/nodes/fpga.conf b/etc/examples/nodes/fpga.conf index e768cc183..697e2a8df 100644 --- a/etc/examples/nodes/fpga.conf +++ b/etc/examples/nodes/fpga.conf @@ -11,7 +11,7 @@ fpgas = { id = "10ee:7021", slot = "0000:88:00.0", do_reset = true, - ips = "../../../fpga/etc/vc707-xbar-pcie/vc707-xbar-pcie.json", + ips = "../../fpga/etc/vc707-xbar-pcie/vc707-xbar-pcie-dino-v2.json", polling = false, } } diff --git a/fpga/include/villas/fpga/ips/dma.hpp b/fpga/include/villas/fpga/ips/dma.hpp index e29cc093a..ce0c899da 100644 --- a/fpga/include/villas/fpga/ips/dma.hpp +++ b/fpga/include/villas/fpga/ips/dma.hpp @@ -10,10 +10,12 @@ #pragma once #include + #include #include #include #include + #include namespace villas { @@ -49,6 +51,14 @@ public: : writeCompleteSimple(); } + bool readScatterGatherPrepare(const MemoryBlock &mem, size_t len); + bool readScatterGatherFast(); + size_t readScatterGatherPoll(bool lock = true); + + bool writeScatterGatherPrepare(const MemoryBlock &mem, size_t len); + bool writeScatterGatherFast(); + size_t writeScatterGatherPoll(bool lock = true); + Completion readComplete() { return hasScatterGather() ? readCompleteScatterGather() : readCompleteSimple(); @@ -61,11 +71,11 @@ public: inline bool hasScatterGather() const { return xConfig.HasSg; } - const StreamVertex &getDefaultSlavePort() const { + const StreamVertex &getDefaultSlavePort() const override { return getSlavePort(s2mmPort); } - const StreamVertex &getDefaultMasterPort() const { + const StreamVertex &getDefaultMasterPort() const override { return getMasterPort(mm2sPort); } @@ -80,7 +90,9 @@ public: private: bool writeScatterGather(const void *buf, size_t len); bool readScatterGather(void *buf, size_t len); + XAxiDma_Bd *writeScatterGatherSetupBd(const void *buf, size_t len); Completion writeCompleteScatterGather(); + XAxiDma_Bd *readScatterGatherSetupBd(void *buf, size_t len); Completion readCompleteScatterGather(); bool writeSimple(const void *buf, size_t len); @@ -89,8 +101,8 @@ private: Completion readCompleteSimple(); void setupScatterGather(); - void setupScatterGatherRingRx(); - void setupScatterGatherRingTx(); + void setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr); + void setupScatterGatherRingTx(uintptr_t physAddr, uintptr_t virtAddr); static constexpr char registerMemory[] = "Reg"; @@ -103,7 +115,7 @@ private: // Optional Scatter-Gather interface to access descriptors static constexpr char sgInterface[] = "M_AXI_SG"; - std::list getMemoryBlocks() const { + std::list getMemoryBlocks() const override { return {registerMemory}; } @@ -114,7 +126,8 @@ private: bool configDone = false; // use polling to wait for DMA completion or interrupts via efds - bool polling = false; + bool polling = false; // polling mode is significantly lower latency + bool cyclic = false; // not fully implemented // Timeout after which the DMA controller issues in interrupt if no data has been received // Delay is 125 x x (clock period of SG clock). SG clock is 100 MHz by default. int delay = 0; @@ -128,31 +141,32 @@ private: // When using SG: ringBdSize is the maximum number of BDs usable in the ring // Depending on alignment, the actual number of BDs usable can be smaller - static constexpr size_t requestedRingBdSize = 2048; + // We use a single BD for transfers, because this way we can achieve the best + // latency. The AXI read cache in the FPGA also only supports a single BD. + // TODO: We could make this configurable in the future. + static constexpr size_t requestedRingBdSize = 1; static constexpr size_t requestedRingBdSizeMemory = requestedRingBdSize * sizeof(XAxiDma_Bd); - uint32_t actualRingBdSize = XAxiDma_BdRingCntCalc( - XAXIDMA_BD_MINIMUM_ALIGNMENT, requestedRingBdSizeMemory); - std::shared_ptr sgRingTx; - std::shared_ptr sgRingRx; + uint32_t actualRingBdSize = 1; + std::shared_ptr sgRing; }; class DmaFactory : NodeFactory { public: - virtual std::string getName() const { return "dma"; } + virtual std::string getName() const override { return "dma"; } - virtual std::string getDescription() const { + virtual std::string getDescription() const override { return "Xilinx's AXI4 Direct Memory Access Controller"; } private: - virtual Vlnv getCompatibleVlnv() const { + virtual Vlnv getCompatibleVlnv() const override { return Vlnv("xilinx.com:ip:axi_dma:"); } // Create a concrete IP instance - Core *make() const { return new Dma; }; + Core *make() const override { return new Dma; }; protected: virtual void parse(Core &ip, json_t *json) override; diff --git a/fpga/lib/ips/dma.cpp b/fpga/lib/ips/dma.cpp index 3442fdc18..872c0ae22 100644 --- a/fpga/lib/ips/dma.cpp +++ b/fpga/lib/ips/dma.cpp @@ -1,21 +1,23 @@ /* DMA driver * - * Author: Daniel Krebs * Author: Niklas Eiling - * SPDX-FileCopyrightText: 2018 Institute for Automation of Complex Power Systems, RWTH Aachen University + * Author: Daniel Krebs + * SPDX-FileCopyrightText: 2018-2024 Institute for Automation of Complex Power Systems, RWTH Aachen University * SPDX-License-Identifier: Apache-2.0 */ #include #include - -#include - -#include +#include #include #include #include +#include + +#include +#include +#include // Max. size of a DMA transfer in simple mode #define FPGA_DMA_BOUNDARY 0x1000 @@ -48,10 +50,9 @@ bool Dma::init() { hwLock.unlock(); // Map buffer descriptors if (hasScatterGather()) { - if (actualRingBdSize < 2 * readCoalesce || - actualRingBdSize < 2 * writeCoalesce) { + if (actualRingBdSize < readCoalesce || actualRingBdSize < writeCoalesce) { throw RuntimeError( - "Ring buffer size is too small for coalesce value {} < 2*{}", + "Ring buffer size is too small for coalesce value {} < {}", actualRingBdSize, std::max(readCoalesce, writeCoalesce)); } setupScatterGather(); @@ -67,11 +68,27 @@ bool Dma::init() { } void Dma::setupScatterGather() { - setupScatterGatherRingRx(); - setupScatterGatherRingTx(); + // Allocate and map space for BD ring in host RAM + auto &alloc = villas::HostRam::getAllocator(); + sgRing = alloc.allocateBlock(2 * requestedRingBdSizeMemory); + + if (not card->mapMemoryBlock(sgRing)) + throw RuntimeError("Memory not accessible by DMA"); + + auto &mm = MemoryManager::get(); + auto trans = mm.getTranslation(busMasterInterfaces[sgInterface], + sgRing->getAddrSpaceId()); + + auto physAddr = reinterpret_cast(trans.getLocalAddr(0)); + auto virtAddr = reinterpret_cast( + mm.getTranslationFromProcess(sgRing->getAddrSpaceId()).getLocalAddr(0)); + setupScatterGatherRingRx(physAddr, virtAddr); + + setupScatterGatherRingTx(physAddr + requestedRingBdSizeMemory, + virtAddr + requestedRingBdSizeMemory); } -void Dma::setupScatterGatherRingRx() { +void Dma::setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr) { int ret; hwLock.lock(); @@ -83,20 +100,6 @@ void Dma::setupScatterGatherRingRx() { // Set delay and coalescing XAxiDma_BdRingSetCoalesce(rxRingPtr, readCoalesce, delay); - // Allocate and map space for BD ring in host RAM - auto &alloc = villas::HostRam::getAllocator(); - sgRingRx = alloc.allocateBlock(requestedRingBdSizeMemory); - - if (not card->mapMemoryBlock(sgRingRx)) - throw RuntimeError("Memory not accessible by DMA"); - - auto &mm = MemoryManager::get(); - auto trans = mm.getTranslation(busMasterInterfaces[sgInterface], - sgRingRx->getAddrSpaceId()); - auto physAddr = reinterpret_cast(trans.getLocalAddr(0)); - auto virtAddr = reinterpret_cast( - mm.getTranslationFromProcess(sgRingRx->getAddrSpaceId()).getLocalAddr(0)); - // Setup Rx BD space ret = XAxiDma_BdRingCreate(rxRingPtr, physAddr, virtAddr, XAXIDMA_BD_MINIMUM_ALIGNMENT, actualRingBdSize); @@ -111,8 +114,15 @@ void Dma::setupScatterGatherRingRx() { if (ret != XST_SUCCESS) throw RuntimeError("Failed to clone BD template: {}", ret); + if (cyclic) { + // Enable Cyclic DMA mode + XAxiDma_BdRingEnableCyclicDMA(rxRingPtr); + XAxiDma_SelectCyclicMode(&xDma, XAXIDMA_DEVICE_TO_DMA, 1); + } // Enable completion interrupt - XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DEVICE_TO_DMA); + if (!polling) { + XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DEVICE_TO_DMA); + } // Start the RX channel ret = XAxiDma_BdRingStart(rxRingPtr); if (ret != XST_SUCCESS) @@ -121,7 +131,7 @@ void Dma::setupScatterGatherRingRx() { hwLock.unlock(); } -void Dma::setupScatterGatherRingTx() { +void Dma::setupScatterGatherRingTx(uintptr_t physAddr, uintptr_t virtAddr) { int ret; hwLock.lock(); @@ -133,20 +143,6 @@ void Dma::setupScatterGatherRingTx() { // Set TX delay and coalesce XAxiDma_BdRingSetCoalesce(txRingPtr, writeCoalesce, delay); - // Allocate and map space for BD ring in host RAM - auto &alloc = villas::HostRam::getAllocator(); - sgRingTx = alloc.allocateBlock(requestedRingBdSizeMemory); - - if (not card->mapMemoryBlock(sgRingTx)) - throw RuntimeError("Memory not accessible by DMA"); - - auto &mm = MemoryManager::get(); - auto trans = mm.getTranslation(busMasterInterfaces[sgInterface], - sgRingTx->getAddrSpaceId()); - auto physAddr = reinterpret_cast(trans.getLocalAddr(0)); - auto virtAddr = reinterpret_cast( - mm.getTranslationFromProcess(sgRingTx->getAddrSpaceId()).getLocalAddr(0)); - // Setup TxBD space ret = XAxiDma_BdRingCreate(txRingPtr, physAddr, virtAddr, XAXIDMA_BD_MINIMUM_ALIGNMENT, actualRingBdSize); @@ -162,7 +158,9 @@ void Dma::setupScatterGatherRingTx() { throw RuntimeError("Failed to clone TX ring BD: {}", ret); // Enable completion interrupt - XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DMA_TO_DEVICE); + if (!polling) { + XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DMA_TO_DEVICE); + } // Start the TX channel ret = XAxiDma_BdRingStart(txRingPtr); if (ret != XST_SUCCESS) @@ -213,12 +211,9 @@ Dma::~Dma() { free(rxRingPtr->CyclicBd); rxRingPtr->CyclicBd = nullptr; } - // unampe SG memory Blocks - if (sgRingTx) { - card->unmapMemoryBlock(*sgRingTx); - } - if (sgRingRx) { - card->unmapMemoryBlock(*sgRingRx); + // Unmap SG memory Blocks + if (sgRing) { + card->unmapMemoryBlock(*sgRing); } } Dma::reset(); @@ -288,12 +283,58 @@ bool Dma::read(const MemoryBlock &mem, size_t len) { : readSimple(buf, len); } -//Write a single message -bool Dma::writeScatterGather(const void *buf, size_t len) { - // buf is address from view of DMA controller - - int ret = XST_FAILURE; +// Reuse existing single BD bypassing BdRingFree, Alloc, ToHw +bool Dma::writeScatterGatherFast() { hwLock.lock(); + auto *txRing = XAxiDma_GetTxRing(&xDma); + if (txRing == nullptr) { + hwLock.unlock(); + throw RuntimeError("RxRing was null."); + } + XAxiDma_Bd *CurBdPtr = txRing->HwHead; + + // Clear the bit we are polling on in complete + uint32_t BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); + BdSts &= ~XAXIDMA_BD_STS_COMPLETE_MASK; + XAxiDma_BdWrite(CurBdPtr, XAXIDMA_BD_STS_OFFSET, BdSts); + + uintptr_t tdesc = ((uintptr_t)txRing->HwTail + + (txRing->FirstBdPhysAddr - txRing->FirstBdAddr)) & + XAXIDMA_DESC_LSB_MASK; + XAxiDma_WriteReg(txRing->ChanBase, XAXIDMA_TDESC_OFFSET, tdesc); + + hwLock.unlock(); + return true; +} + +bool Dma::writeScatterGatherPrepare(const MemoryBlock &mem, size_t len) { + + auto &mm = MemoryManager::get(); + + // User has to make sure that memory is accessible, otherwise this will throw + auto trans = mm.getTranslation(busMasterInterfaces[mm2sInterface], + mem.getAddrSpaceId()); + void *buf = reinterpret_cast(trans.getLocalAddr(0)); + if (buf == nullptr) { + throw RuntimeError("Buffer was null"); + } + hwLock.lock(); + + auto bd = writeScatterGatherSetupBd(buf, len); + + hwLock.unlock(); + + return bd != nullptr; +} + +XAxiDma_Bd *Dma::writeScatterGatherSetupBd(const void *buf, size_t len) { + uint32_t ret = XST_FAILURE; + if (len == 0) + return nullptr; + + if (len > FPGA_DMA_BOUNDARY) + return nullptr; + auto *txRing = XAxiDma_GetTxRing(&xDma); if (txRing == nullptr) { hwLock.unlock(); @@ -325,7 +366,22 @@ bool Dma::writeScatterGather(const void *buf, size_t len) { // TODO: Check if we really need this XAxiDma_BdSetId(bd, (uintptr_t)buf); + return bd; +} +// Write a single message +bool Dma::writeScatterGather(const void *buf, size_t len) { + // buf is address from view of DMA controller + + int ret = XST_FAILURE; + hwLock.lock(); + auto *txRing = XAxiDma_GetTxRing(&xDma); + if (txRing == nullptr) { + hwLock.unlock(); + throw RuntimeError("TxRing was null."); + } + + XAxiDma_Bd *bd = writeScatterGatherSetupBd(buf, len); // Give control of BD to HW. We should not access it until transfer is finished. // Failure could also indicate that EOF is not set on last Bd ret = XAxiDma_BdRingToHw(txRing, 1, bd); @@ -339,20 +395,64 @@ bool Dma::writeScatterGather(const void *buf, size_t len) { return true; } -bool Dma::readScatterGather(void *buf, size_t len) { - int ret = XST_FAILURE; - - if (len < readCoalesce * readMsgSize) { - throw RuntimeError( - "Read size is smaller than readCoalesce*msgSize. Cannot setup BDs."); - } - +// Reuse existing single BD bypassing BdRingFree, Alloc, ToHw +bool Dma::readScatterGatherFast() { hwLock.lock(); auto *rxRing = XAxiDma_GetRxRing(&xDma); if (rxRing == nullptr) { hwLock.unlock(); throw RuntimeError("RxRing was null."); } + XAxiDma_Bd *CurBdPtr = rxRing->HwHead; + // Poll BD status to avoid accessing PCIe address space + uint32_t BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); + + // Clear the bit we are polling on in complete + BdSts &= ~XAXIDMA_BD_STS_COMPLETE_MASK; + XAxiDma_BdWrite(CurBdPtr, XAXIDMA_BD_STS_OFFSET, BdSts); + + uintptr_t tdesc = ((uintptr_t)rxRing->HwTail + + (rxRing->FirstBdPhysAddr - rxRing->FirstBdAddr)) & + XAXIDMA_DESC_LSB_MASK; + XAxiDma_WriteReg(rxRing->ChanBase, XAXIDMA_TDESC_OFFSET, tdesc); + + hwLock.unlock(); + return true; +} + +bool Dma::readScatterGatherPrepare(const MemoryBlock &mem, size_t len) { + + auto &mm = MemoryManager::get(); + + // User has to make sure that memory is accessible, otherwise this will throw + auto trans = mm.getTranslation(busMasterInterfaces[s2mmInterface], + mem.getAddrSpaceId()); + void *buf = reinterpret_cast(trans.getLocalAddr(0)); + if (buf == nullptr) { + throw RuntimeError("Buffer was null"); + } + hwLock.lock(); + + auto bd = readScatterGatherSetupBd(buf, len); + + hwLock.unlock(); + + return bd != nullptr; +} + +XAxiDma_Bd *Dma::readScatterGatherSetupBd(void *buf, size_t len) { + uint32_t ret = XST_FAILURE; + if (len == 0) + return nullptr; + + if (len > FPGA_DMA_BOUNDARY) + return nullptr; + + auto *rxRing = XAxiDma_GetRxRing(&xDma); + if (rxRing == nullptr) { + hwLock.unlock(); + throw RuntimeError("RxRing was null."); + } XAxiDma_Bd *bd; ret = XAxiDma_BdRingAlloc(rxRing, readCoalesce, &bd); @@ -388,6 +488,25 @@ bool Dma::readScatterGather(void *buf, size_t len) { curBuf += readMsgSize; curBd = (XAxiDma_Bd *)XAxiDma_BdRingNext(rxRing, curBd); } + return bd; +} + +bool Dma::readScatterGather(void *buf, size_t len) { + uint32_t ret = XST_FAILURE; + + if (len < readCoalesce * readMsgSize) { + throw RuntimeError( + "Read size is smaller than readCoalesce*msgSize. Cannot setup BDs."); + } + + hwLock.lock(); + auto *rxRing = XAxiDma_GetRxRing(&xDma); + if (rxRing == nullptr) { + hwLock.unlock(); + throw RuntimeError("RxRing was null."); + } + + XAxiDma_Bd *bd = readScatterGatherSetupBd(buf, len); ret = XAxiDma_BdRingToHw(rxRing, readCoalesce, bd); if (ret != XST_SUCCESS) { @@ -399,30 +518,36 @@ bool Dma::readScatterGather(void *buf, size_t len) { return true; } +size_t Dma::writeScatterGatherPoll(bool lock) { + if (lock) { + hwLock.lock(); + } + auto txRing = XAxiDma_GetTxRing(&xDma); + XAxiDma_Bd *CurBdPtr = txRing->HwHead; + volatile uint32_t BdSts; + // Poll BD status to avoid accessing PCIe address space + do { + BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); + } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK)); + // At this point, we know that the transmission is complete, but we haven't accessed the + // PCIe address space, yet. We know that we have received at least one BD. + if (lock) { + hwLock.unlock(); + } + return XAxiDma_BdGetActualLength(CurBdPtr, XAXIDMA_MCHAN_MAX_TRANSFER_LEN); +} + Dma::Completion Dma::writeCompleteScatterGather() { Completion c; XAxiDma_Bd *bd = nullptr, *curBd; auto txRing = XAxiDma_GetTxRing(&xDma); - int ret = XST_FAILURE; + uint32_t ret = XST_FAILURE; static size_t errcnt = 32; uint32_t irqStatus = 0; if (polling) { hwLock.lock(); - XAxiDma_Bd *CurBdPtr = txRing->HwHead; - volatile uint32_t BdSts; - // Poll BD status to avoid accessing PCIe address space - do { - BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); - } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK)); - // At this point, we know that the transmission is complete, but we haven't accessed the - // PCIe address space, yet. The subsequent DMA Controller management can be done in a - // separate thread to keep latencies in this thread extremly low. We know that we have - // received one BD. - do { - // This takes 1.5 us - irqStatus = XAxiDma_BdRingGetIrq(txRing); - } while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)); + writeScatterGatherPoll(false); } else { c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt( irqs[mm2sInterrupt].num); @@ -441,8 +566,8 @@ Dma::Completion Dma::writeCompleteScatterGather() { // Acknowledge the interrupt if (!polling) { irqStatus = XAxiDma_BdRingGetIrq(txRing); + XAxiDma_BdRingAckIrq(txRing, irqStatus); } - XAxiDma_BdRingAckIrq(txRing, irqStatus); if (c.bds == 0) { c.bytes = 0; @@ -461,7 +586,7 @@ Dma::Completion Dma::writeCompleteScatterGather() { if ((ret & XAXIDMA_BD_STS_ALL_ERR_MASK) || (!(ret & XAXIDMA_BD_STS_COMPLETE_MASK))) { hwLock.unlock(); - throw RuntimeError("Bd Status register shows error: {}", ret); + throw RuntimeError("Write: Bd Status register shows error: {:#x}", ret); } c.bytes += XAxiDma_BdGetLength(bd, txRing->MaxTransferLen); @@ -478,6 +603,25 @@ Dma::Completion Dma::writeCompleteScatterGather() { return c; } +size_t Dma::readScatterGatherPoll(bool lock) { + if (lock) { + hwLock.lock(); + } + auto rxRing = XAxiDma_GetRxRing(&xDma); + XAxiDma_Bd *CurBdPtr = rxRing->HwHead; + volatile uint32_t BdSts; + // Poll BD status to avoid accessing PCIe address space + do { + BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); + } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK)); + // At this point, we know that the transmission is complete, but we haven't accessed the + // PCIe address space, yet. We know that we have received at least one BD. + if (lock) { + hwLock.unlock(); + } + return XAxiDma_BdGetActualLength(CurBdPtr, XAXIDMA_MCHAN_MAX_TRANSFER_LEN); +} + Dma::Completion Dma::readCompleteScatterGather() { Completion c; XAxiDma_Bd *bd = nullptr, *curBd; @@ -489,20 +633,7 @@ Dma::Completion Dma::readCompleteScatterGather() { uint32_t irqStatus = 0; if (polling) { hwLock.lock(); - XAxiDma_Bd *CurBdPtr = rxRing->HwHead; - volatile uint32_t BdSts; - // Poll BD status to avoid accessing PCIe address space - do { - BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET); - } while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK)); - // At this point, we know that the transmission is complete, but we haven't accessed the - // PCIe address space, yet. The subsequent DMA Controller management can be done in a - // separate thread to keep latencies in this thread extremly low. We know that we have - // received one BD. - do { - // This takes 1.5 us - irqStatus = XAxiDma_BdRingGetIrq(rxRing); - } while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)); + readScatterGatherPoll(false); intrs = 1; } else { intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt( @@ -521,18 +652,17 @@ Dma::Completion Dma::readCompleteScatterGather() { c.interrupts = 0; return c; } else { - hwLock.unlock(); c.interrupts = intrs; } if (!polling) { irqStatus = XAxiDma_BdRingGetIrq(rxRing); + XAxiDma_BdRingAckIrq(rxRing, irqStatus); + if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) { + logger->error("Expected IOC interrupt but IRQ status is: {:#x}", + irqStatus); + return c; + } } - XAxiDma_BdRingAckIrq(rxRing, irqStatus); - if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) { - logger->error("Expected IOC interrupt but IRQ status is: {:#x}", irqStatus); - return c; - } - // Wait until the data has been received by the RX channel. if ((c.bds = XAxiDma_BdRingFromHw(rxRing, readCoalesce, &bd)) < readCoalesce) { @@ -564,7 +694,7 @@ Dma::Completion Dma::readCompleteScatterGather() { if ((ret & XAXIDMA_BD_STS_ALL_ERR_MASK) || (!(ret & XAXIDMA_BD_STS_COMPLETE_MASK))) { hwLock.unlock(); - throw RuntimeError("Bd Status register shows error: {}", ret); + throw RuntimeError("Read: Bd Status register shows error: {}", ret); } c.bytes += XAxiDma_BdGetActualLength(bd, rxRing->MaxTransferLen); diff --git a/fpga/lib/ips/register.cpp b/fpga/lib/ips/register.cpp index 83bb134c0..9310a1183 100644 --- a/fpga/lib/ips/register.cpp +++ b/fpga/lib/ips/register.cpp @@ -43,9 +43,12 @@ bool Register::check() { } // This is Dino specific for now - we should possibly move this to Dino in the future - setRegister(0, static_cast(1000)); // set Dino to a rate of 20 kHz - setRegister(1, -0.001615254F); - setRegister(2, 10.8061F); + constexpr double dinoClk = 25e9; // Dino is clocked with 25 Mhz + constexpr double sampleRate = 20e6; // We want to achieve a timestep of 50us + constexpr uint32_t dinoTimerVal = static_cast(dinoClk / sampleRate); + setRegister(0, dinoTimerVal); // Timer value for generating ADC trigger signal + setRegister(1, -0.001615254F); // Scale factor for ADC value + setRegister(2, 10.8061F); // Offset for ADC value uint32_t rate = getRegister(0); float scale = getRegisterFloat(1); float offset = getRegisterFloat(2); diff --git a/include/villas/nodes/fpga.hpp b/include/villas/nodes/fpga.hpp index da9bdebd2..a124efca3 100644 --- a/include/villas/nodes/fpga.hpp +++ b/include/villas/nodes/fpga.hpp @@ -9,6 +9,7 @@ #pragma once +#include #include #include #include @@ -31,15 +32,31 @@ protected: std::string cardName; std::list connectStrings; + // This setting improves latency by removing various checks. + // Use with caution! Requires read cache in FPGA design! + // The common use case in VILLASfpga is that we have exactly + // one write for every read and the number of exchanged signals + // do not change. If this is the case, we can reuse the buffer + // descriptors during reads and write, thus avoidng freeing, + // reallocating and setting them up. + // We set up the descriptors in start, and in write or read, + // we only reset the complete bit in the buffer descriptor and + // write to the tdesc register to start the DMA transfer. + // Improves read/write latency by approx. 40%. + bool lowLatencyMode; + // State std::shared_ptr card; std::shared_ptr dma; - std::shared_ptr blockRx[2]; + std::shared_ptr blockRx; std::shared_ptr blockTx; // Non-public methods + virtual int fastRead(Sample *smps[], unsigned cnt); + virtual int slowRead(Sample *smps[], unsigned cnt); virtual int _read(Sample *smps[], unsigned cnt) override; - + virtual int fastWrite(Sample *smps[], unsigned cnt); + virtual int slowWrite(Sample *smps[], unsigned cnt); virtual int _write(Sample *smps[], unsigned cnt) override; public: @@ -55,6 +72,8 @@ public: virtual int start() override; + virtual int stop() override; + virtual std::vector getPollFDs() override; virtual const std::string &getDetails() override; diff --git a/lib/nodes/fpga.cpp b/lib/nodes/fpga.cpp index 8d0cd0975..c365436e9 100644 --- a/lib/nodes/fpga.cpp +++ b/lib/nodes/fpga.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -32,8 +33,8 @@ static std::list> cards; static std::shared_ptr vfioContainer; FpgaNode::FpgaNode(const uuid_t &id, const std::string &name) - : Node(id, name), cardName(""), card(nullptr), dma(), blockRx(), blockTx() { -} + : Node(id, name), cardName(""), connectStrings(), lowLatencyMode(false), + card(nullptr), dma(), blockRx(), blockTx() {} FpgaNode::~FpgaNode() {} @@ -75,14 +76,12 @@ int FpgaNode::prepare() { auto &alloc = HostRam::getAllocator(); - blockRx[0] = alloc.allocateBlock(0x200 * sizeof(float)); - blockRx[1] = alloc.allocateBlock(0x200 * sizeof(float)); + blockRx = alloc.allocateBlock(0x200 * sizeof(float)); blockTx = alloc.allocateBlock(0x200 * sizeof(float)); - villas::MemoryAccessor memRx[] = {*(blockRx[0]), *(blockRx[1])}; + villas::MemoryAccessor memRx = *blockRx; villas::MemoryAccessor memTx = *blockTx; - dma->makeAccesibleFromVA(blockRx[0]); - dma->makeAccesibleFromVA(blockRx[1]); + dma->makeAccesibleFromVA(blockRx); dma->makeAccesibleFromVA(blockTx); MemoryManager::get().printGraph(); @@ -90,6 +89,8 @@ int FpgaNode::prepare() { return Node::prepare(); } +int FpgaNode::stop() { return Node::stop(); } + int FpgaNode::parse(json_t *json) { int ret = Node::parse(json); if (ret) { @@ -105,8 +106,9 @@ int FpgaNode::parse(json_t *json) { vfioContainer = std::make_shared(); } - ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o}", "card", &jsonCard, - "connect", &jsonConnectStrings); + ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o, s?: b, s?: b}", "card", + &jsonCard, "connect", &jsonConnectStrings, + "lowLatencyMode", &lowLatencyMode); if (ret) { throw ConfigError(json, err, "node-config-fpga", "Failed to parse configuration of node {}", @@ -158,7 +160,8 @@ const std::string &FpgaNode::getDetails() { std::copy(connectStrings.begin(), connectStrings.end(), std::ostream_iterator(imploded, delim)); - details = fmt::format("fpga={}, connect={}", name, imploded.str()); + details = fmt::format("fpga={}, connect={}, lowLatencyMode={}", name, + imploded.str(), lowLatencyMode); } return details; @@ -167,19 +170,99 @@ const std::string &FpgaNode::getDetails() { int FpgaNode::check() { return 0; } int FpgaNode::start() { - // enque first read - // dma->read(*(blockRx[0]), blockRx[0]->getSize()); + if (getInputSignalsMaxCount() * sizeof(float) > blockRx->getSize()) { + logger->error("Input signals exceed block size."); + throw villas ::RuntimeError("Input signals exceed block size."); + } + if (lowLatencyMode) { + dma->readScatterGatherPrepare(*blockRx, blockRx->getSize()); + if (getInputSignalsMaxCount() != 0) { + dma->writeScatterGatherPrepare(*blockTx, + getInputSignalsMaxCount() * sizeof(float)); + } else { + logger->warn("No input signals defined. Not preparing write buffer - " + "writes will not work."); + } + } + return Node::start(); } +// We cannot modify the BD here, so writes are fixed length. +// If fastWrite receives less signals than expected, the previous data +// will be reused for the remaining signals +int FpgaNode::fastWrite(Sample *smps[], unsigned cnt) { + Sample *smp = smps[0]; + + assert(cnt == 1 && smps != nullptr && smps[0] != nullptr); + + auto mem = MemoryAccessor(*blockTx); + float scaled; + + for (unsigned i = 0; i < smp->length; i++) { + if (smp->signals->getByIndex(i)->type == SignalType::FLOAT) { + scaled = smp->data[i].f; + if (scaled > 10.) { + scaled = 10.; + } else if (scaled < -10.) { + scaled = -10.; + } + mem[i] = (scaled + 10.) * ((float)0xFFFF / 20.); + } else { + mem[i] = smp->data[i].i; + } + } + + dma->writeScatterGatherFast(); + auto written = dma->writeScatterGatherPoll() / + sizeof(float); // The number of samples written + + if (written != smp->length) { + logger->warn("Wrote {} samples, but {} were expected", written, + smp->length); + } + + return 1; +} + +// Because we cannot modify the BD here, reads are fixed length. +// However, if we receive less data than expected, we will return only +// what we have received. fastRead is thus capable of partial reads. +int FpgaNode::fastRead(Sample *smps[], unsigned cnt) { + Sample *smp = smps[0]; + auto mem = MemoryAccessor(*blockRx); + + smp->flags = (int)SampleFlags::HAS_DATA; + smp->signals = in.signals; + + dma->readScatterGatherFast(); + auto read = dma->readScatterGatherPoll(true); + // We assume a lot without checking at this point. All for the latency! + + smp->length = 0; + for (unsigned i = 0; i < MIN(read / sizeof(float), smp->capacity); i++) { + smp->data[i].f = static_cast(mem[i]); + smp->length++; + } + + return 1; +} + int FpgaNode::_read(Sample *smps[], unsigned cnt) { - static size_t cur = 0, next = 0; + if (lowLatencyMode) { + return fastRead(smps, cnt); + } else { + return slowRead(smps, cnt); + } +} + +int FpgaNode::slowRead(Sample *smps[], unsigned cnt) { unsigned read; Sample *smp = smps[0]; assert(cnt == 1); - dma->read(*(blockRx[next]), blockRx[next]->getSize()); // TODO: calc size + dma->read(*blockRx, blockRx->getSize()); auto c = dma->readComplete(); read = c.bytes / sizeof(float); @@ -188,7 +271,7 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) { logger->warn("Missed {} interrupts", c.interrupts - 1); } - auto mem = MemoryAccessor(*(blockRx[cur])); + auto mem = MemoryAccessor(*blockRx); smp->length = 0; for (unsigned i = 0; i < MIN(read, smp->capacity); i++) { @@ -198,14 +281,20 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) { smp->flags = (int)SampleFlags::HAS_DATA; smp->signals = in.signals; - //cur = next; - //next = (next + 1) % (sizeof(blockRx) / sizeof(blockRx[0])); return 1; } int FpgaNode::_write(Sample *smps[], unsigned cnt) { - unsigned int written; + if (lowLatencyMode) { + return fastWrite(smps, cnt); + } else { + return slowWrite(smps, cnt); + } +} + +int FpgaNode::slowWrite(Sample *smps[], unsigned cnt) { + // unsigned int written; Sample *smp = smps[0]; assert(cnt == 1 && smps != nullptr && smps[0] != nullptr); @@ -224,11 +313,11 @@ int FpgaNode::_write(Sample *smps[], unsigned cnt) { } bool state = dma->write(*blockTx, smp->length * sizeof(float)); - if (!state) + if (!state) { return -1; - - written = dma->writeComplete().bytes / - sizeof(float); // The number of samples written + } + auto written = dma->writeComplete().bytes / + sizeof(float); // The number of samples written if (written != smp->length) { logger->warn("Wrote {} samples, but {} were expected", written,