mirror of
https://git.rwth-aachen.de/acs/public/villas/node/
synced 2025-03-09 00:00:00 +01:00
Merge remote-tracking branch 'origin/master' into node-smu
This commit is contained in:
commit
9cd8729646
6 changed files with 399 additions and 144 deletions
|
@ -11,7 +11,7 @@ fpgas = {
|
|||
id = "10ee:7021",
|
||||
slot = "0000:88:00.0",
|
||||
do_reset = true,
|
||||
ips = "../../../fpga/etc/vc707-xbar-pcie/vc707-xbar-pcie.json",
|
||||
ips = "../../fpga/etc/vc707-xbar-pcie/vc707-xbar-pcie-dino-v2.json",
|
||||
polling = false,
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,10 +10,12 @@
|
|||
#pragma once
|
||||
|
||||
#include <fmt/ostream.h>
|
||||
|
||||
#include <villas/config.hpp>
|
||||
#include <villas/exceptions.hpp>
|
||||
#include <villas/fpga/node.hpp>
|
||||
#include <villas/memory.hpp>
|
||||
|
||||
#include <xilinx/xaxidma.h>
|
||||
|
||||
namespace villas {
|
||||
|
@ -49,6 +51,14 @@ public:
|
|||
: writeCompleteSimple();
|
||||
}
|
||||
|
||||
bool readScatterGatherPrepare(const MemoryBlock &mem, size_t len);
|
||||
bool readScatterGatherFast();
|
||||
size_t readScatterGatherPoll(bool lock = true);
|
||||
|
||||
bool writeScatterGatherPrepare(const MemoryBlock &mem, size_t len);
|
||||
bool writeScatterGatherFast();
|
||||
size_t writeScatterGatherPoll(bool lock = true);
|
||||
|
||||
Completion readComplete() {
|
||||
return hasScatterGather() ? readCompleteScatterGather()
|
||||
: readCompleteSimple();
|
||||
|
@ -61,11 +71,11 @@ public:
|
|||
|
||||
inline bool hasScatterGather() const { return xConfig.HasSg; }
|
||||
|
||||
const StreamVertex &getDefaultSlavePort() const {
|
||||
const StreamVertex &getDefaultSlavePort() const override {
|
||||
return getSlavePort(s2mmPort);
|
||||
}
|
||||
|
||||
const StreamVertex &getDefaultMasterPort() const {
|
||||
const StreamVertex &getDefaultMasterPort() const override {
|
||||
return getMasterPort(mm2sPort);
|
||||
}
|
||||
|
||||
|
@ -80,7 +90,9 @@ public:
|
|||
private:
|
||||
bool writeScatterGather(const void *buf, size_t len);
|
||||
bool readScatterGather(void *buf, size_t len);
|
||||
XAxiDma_Bd *writeScatterGatherSetupBd(const void *buf, size_t len);
|
||||
Completion writeCompleteScatterGather();
|
||||
XAxiDma_Bd *readScatterGatherSetupBd(void *buf, size_t len);
|
||||
Completion readCompleteScatterGather();
|
||||
|
||||
bool writeSimple(const void *buf, size_t len);
|
||||
|
@ -89,8 +101,8 @@ private:
|
|||
Completion readCompleteSimple();
|
||||
|
||||
void setupScatterGather();
|
||||
void setupScatterGatherRingRx();
|
||||
void setupScatterGatherRingTx();
|
||||
void setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr);
|
||||
void setupScatterGatherRingTx(uintptr_t physAddr, uintptr_t virtAddr);
|
||||
|
||||
static constexpr char registerMemory[] = "Reg";
|
||||
|
||||
|
@ -103,7 +115,7 @@ private:
|
|||
// Optional Scatter-Gather interface to access descriptors
|
||||
static constexpr char sgInterface[] = "M_AXI_SG";
|
||||
|
||||
std::list<MemoryBlockName> getMemoryBlocks() const {
|
||||
std::list<MemoryBlockName> getMemoryBlocks() const override {
|
||||
return {registerMemory};
|
||||
}
|
||||
|
||||
|
@ -114,7 +126,8 @@ private:
|
|||
|
||||
bool configDone = false;
|
||||
// use polling to wait for DMA completion or interrupts via efds
|
||||
bool polling = false;
|
||||
bool polling = false; // polling mode is significantly lower latency
|
||||
bool cyclic = false; // not fully implemented
|
||||
// Timeout after which the DMA controller issues an interrupt if no data has been received
|
||||
// Delay is 125 x <delay> x (clock period of SG clock). SG clock is 100 MHz by default.
|
||||
int delay = 0;
|
||||
|
@ -128,31 +141,32 @@ private:
|
|||
|
||||
// When using SG: ringBdSize is the maximum number of BDs usable in the ring
|
||||
// Depending on alignment, the actual number of BDs usable can be smaller
|
||||
static constexpr size_t requestedRingBdSize = 2048;
|
||||
// We use a single BD for transfers, because this way we can achieve the best
|
||||
// latency. The AXI read cache in the FPGA also only supports a single BD.
|
||||
// TODO: We could make this configurable in the future.
|
||||
static constexpr size_t requestedRingBdSize = 1;
|
||||
static constexpr size_t requestedRingBdSizeMemory =
|
||||
requestedRingBdSize * sizeof(XAxiDma_Bd);
|
||||
uint32_t actualRingBdSize = XAxiDma_BdRingCntCalc(
|
||||
XAXIDMA_BD_MINIMUM_ALIGNMENT, requestedRingBdSizeMemory);
|
||||
std::shared_ptr<MemoryBlock> sgRingTx;
|
||||
std::shared_ptr<MemoryBlock> sgRingRx;
|
||||
uint32_t actualRingBdSize = 1;
|
||||
std::shared_ptr<MemoryBlock> sgRing;
|
||||
};
|
||||
|
||||
class DmaFactory : NodeFactory {
|
||||
|
||||
public:
|
||||
virtual std::string getName() const { return "dma"; }
|
||||
virtual std::string getName() const override { return "dma"; }
|
||||
|
||||
virtual std::string getDescription() const {
|
||||
virtual std::string getDescription() const override {
|
||||
return "Xilinx's AXI4 Direct Memory Access Controller";
|
||||
}
|
||||
|
||||
private:
|
||||
virtual Vlnv getCompatibleVlnv() const {
|
||||
virtual Vlnv getCompatibleVlnv() const override {
|
||||
return Vlnv("xilinx.com:ip:axi_dma:");
|
||||
}
|
||||
|
||||
// Create a concrete IP instance
|
||||
Core *make() const { return new Dma; };
|
||||
Core *make() const override { return new Dma; };
|
||||
|
||||
protected:
|
||||
virtual void parse(Core &ip, json_t *json) override;
|
||||
|
|
|
@ -1,21 +1,23 @@
|
|||
/* DMA driver
|
||||
*
|
||||
* Author: Daniel Krebs <github@daniel-krebs.net>
|
||||
* Author: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
|
||||
* SPDX-FileCopyrightText: 2018 Institute for Automation of Complex Power Systems, RWTH Aachen University
|
||||
* Author: Daniel Krebs <github@daniel-krebs.net>
|
||||
* SPDX-FileCopyrightText: 2018-2024 Institute for Automation of Complex Power Systems, RWTH Aachen University
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include <xilinx/xaxidma.h>
|
||||
|
||||
#include <villas/memory.hpp>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <villas/fpga/card.hpp>
|
||||
#include <villas/fpga/ips/dma.hpp>
|
||||
#include <villas/fpga/ips/intc.hpp>
|
||||
#include <villas/memory.hpp>
|
||||
|
||||
#include <xilinx/xaxidma.h>
|
||||
#include <xilinx/xaxidma_bd.h>
|
||||
#include <xilinx/xaxidma_hw.h>
|
||||
|
||||
// Max. size of a DMA transfer in simple mode
|
||||
#define FPGA_DMA_BOUNDARY 0x1000
|
||||
|
@ -48,10 +50,9 @@ bool Dma::init() {
|
|||
hwLock.unlock();
|
||||
// Map buffer descriptors
|
||||
if (hasScatterGather()) {
|
||||
if (actualRingBdSize < 2 * readCoalesce ||
|
||||
actualRingBdSize < 2 * writeCoalesce) {
|
||||
if (actualRingBdSize < readCoalesce || actualRingBdSize < writeCoalesce) {
|
||||
throw RuntimeError(
|
||||
"Ring buffer size is too small for coalesce value {} < 2*{}",
|
||||
"Ring buffer size is too small for coalesce value {} < {}",
|
||||
actualRingBdSize, std::max(readCoalesce, writeCoalesce));
|
||||
}
|
||||
setupScatterGather();
|
||||
|
@ -67,11 +68,27 @@ bool Dma::init() {
|
|||
}
|
||||
|
||||
void Dma::setupScatterGather() {
|
||||
setupScatterGatherRingRx();
|
||||
setupScatterGatherRingTx();
|
||||
// Allocate and map space for BD ring in host RAM
|
||||
auto &alloc = villas::HostRam::getAllocator();
|
||||
sgRing = alloc.allocateBlock(2 * requestedRingBdSizeMemory);
|
||||
|
||||
if (not card->mapMemoryBlock(sgRing))
|
||||
throw RuntimeError("Memory not accessible by DMA");
|
||||
|
||||
auto &mm = MemoryManager::get();
|
||||
auto trans = mm.getTranslation(busMasterInterfaces[sgInterface],
|
||||
sgRing->getAddrSpaceId());
|
||||
|
||||
auto physAddr = reinterpret_cast<uintptr_t>(trans.getLocalAddr(0));
|
||||
auto virtAddr = reinterpret_cast<uintptr_t>(
|
||||
mm.getTranslationFromProcess(sgRing->getAddrSpaceId()).getLocalAddr(0));
|
||||
setupScatterGatherRingRx(physAddr, virtAddr);
|
||||
|
||||
setupScatterGatherRingTx(physAddr + requestedRingBdSizeMemory,
|
||||
virtAddr + requestedRingBdSizeMemory);
|
||||
}
|
||||
|
||||
void Dma::setupScatterGatherRingRx() {
|
||||
void Dma::setupScatterGatherRingRx(uintptr_t physAddr, uintptr_t virtAddr) {
|
||||
int ret;
|
||||
|
||||
hwLock.lock();
|
||||
|
@ -83,20 +100,6 @@ void Dma::setupScatterGatherRingRx() {
|
|||
// Set delay and coalescing
|
||||
XAxiDma_BdRingSetCoalesce(rxRingPtr, readCoalesce, delay);
|
||||
|
||||
// Allocate and map space for BD ring in host RAM
|
||||
auto &alloc = villas::HostRam::getAllocator();
|
||||
sgRingRx = alloc.allocateBlock(requestedRingBdSizeMemory);
|
||||
|
||||
if (not card->mapMemoryBlock(sgRingRx))
|
||||
throw RuntimeError("Memory not accessible by DMA");
|
||||
|
||||
auto &mm = MemoryManager::get();
|
||||
auto trans = mm.getTranslation(busMasterInterfaces[sgInterface],
|
||||
sgRingRx->getAddrSpaceId());
|
||||
auto physAddr = reinterpret_cast<uintptr_t>(trans.getLocalAddr(0));
|
||||
auto virtAddr = reinterpret_cast<uintptr_t>(
|
||||
mm.getTranslationFromProcess(sgRingRx->getAddrSpaceId()).getLocalAddr(0));
|
||||
|
||||
// Setup Rx BD space
|
||||
ret = XAxiDma_BdRingCreate(rxRingPtr, physAddr, virtAddr,
|
||||
XAXIDMA_BD_MINIMUM_ALIGNMENT, actualRingBdSize);
|
||||
|
@ -111,8 +114,15 @@ void Dma::setupScatterGatherRingRx() {
|
|||
if (ret != XST_SUCCESS)
|
||||
throw RuntimeError("Failed to clone BD template: {}", ret);
|
||||
|
||||
if (cyclic) {
|
||||
// Enable Cyclic DMA mode
|
||||
XAxiDma_BdRingEnableCyclicDMA(rxRingPtr);
|
||||
XAxiDma_SelectCyclicMode(&xDma, XAXIDMA_DEVICE_TO_DMA, 1);
|
||||
}
|
||||
// Enable completion interrupt
|
||||
XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DEVICE_TO_DMA);
|
||||
if (!polling) {
|
||||
XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DEVICE_TO_DMA);
|
||||
}
|
||||
// Start the RX channel
|
||||
ret = XAxiDma_BdRingStart(rxRingPtr);
|
||||
if (ret != XST_SUCCESS)
|
||||
|
@ -121,7 +131,7 @@ void Dma::setupScatterGatherRingRx() {
|
|||
hwLock.unlock();
|
||||
}
|
||||
|
||||
void Dma::setupScatterGatherRingTx() {
|
||||
void Dma::setupScatterGatherRingTx(uintptr_t physAddr, uintptr_t virtAddr) {
|
||||
int ret;
|
||||
|
||||
hwLock.lock();
|
||||
|
@ -133,20 +143,6 @@ void Dma::setupScatterGatherRingTx() {
|
|||
// Set TX delay and coalesce
|
||||
XAxiDma_BdRingSetCoalesce(txRingPtr, writeCoalesce, delay);
|
||||
|
||||
// Allocate and map space for BD ring in host RAM
|
||||
auto &alloc = villas::HostRam::getAllocator();
|
||||
sgRingTx = alloc.allocateBlock(requestedRingBdSizeMemory);
|
||||
|
||||
if (not card->mapMemoryBlock(sgRingTx))
|
||||
throw RuntimeError("Memory not accessible by DMA");
|
||||
|
||||
auto &mm = MemoryManager::get();
|
||||
auto trans = mm.getTranslation(busMasterInterfaces[sgInterface],
|
||||
sgRingTx->getAddrSpaceId());
|
||||
auto physAddr = reinterpret_cast<uintptr_t>(trans.getLocalAddr(0));
|
||||
auto virtAddr = reinterpret_cast<uintptr_t>(
|
||||
mm.getTranslationFromProcess(sgRingTx->getAddrSpaceId()).getLocalAddr(0));
|
||||
|
||||
// Setup TxBD space
|
||||
ret = XAxiDma_BdRingCreate(txRingPtr, physAddr, virtAddr,
|
||||
XAXIDMA_BD_MINIMUM_ALIGNMENT, actualRingBdSize);
|
||||
|
@ -162,7 +158,9 @@ void Dma::setupScatterGatherRingTx() {
|
|||
throw RuntimeError("Failed to clone TX ring BD: {}", ret);
|
||||
|
||||
// Enable completion interrupt
|
||||
XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DMA_TO_DEVICE);
|
||||
if (!polling) {
|
||||
XAxiDma_IntrEnable(&xDma, XAXIDMA_IRQ_IOC_MASK, XAXIDMA_DMA_TO_DEVICE);
|
||||
}
|
||||
// Start the TX channel
|
||||
ret = XAxiDma_BdRingStart(txRingPtr);
|
||||
if (ret != XST_SUCCESS)
|
||||
|
@ -213,12 +211,9 @@ Dma::~Dma() {
|
|||
free(rxRingPtr->CyclicBd);
|
||||
rxRingPtr->CyclicBd = nullptr;
|
||||
}
|
||||
// unampe SG memory Blocks
|
||||
if (sgRingTx) {
|
||||
card->unmapMemoryBlock(*sgRingTx);
|
||||
}
|
||||
if (sgRingRx) {
|
||||
card->unmapMemoryBlock(*sgRingRx);
|
||||
// Unmap SG memory Blocks
|
||||
if (sgRing) {
|
||||
card->unmapMemoryBlock(*sgRing);
|
||||
}
|
||||
}
|
||||
Dma::reset();
|
||||
|
@ -288,12 +283,58 @@ bool Dma::read(const MemoryBlock &mem, size_t len) {
|
|||
: readSimple(buf, len);
|
||||
}
|
||||
|
||||
//Write a single message
|
||||
bool Dma::writeScatterGather(const void *buf, size_t len) {
|
||||
// buf is address from view of DMA controller
|
||||
|
||||
int ret = XST_FAILURE;
|
||||
// Reuse existing single BD bypassing BdRingFree, Alloc, ToHw
|
||||
bool Dma::writeScatterGatherFast() {
|
||||
hwLock.lock();
|
||||
auto *txRing = XAxiDma_GetTxRing(&xDma);
|
||||
if (txRing == nullptr) {
|
||||
hwLock.unlock();
|
||||
throw RuntimeError("RxRing was null.");
|
||||
}
|
||||
XAxiDma_Bd *CurBdPtr = txRing->HwHead;
|
||||
|
||||
// Clear the bit we are polling on in complete
|
||||
uint32_t BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
|
||||
BdSts &= ~XAXIDMA_BD_STS_COMPLETE_MASK;
|
||||
XAxiDma_BdWrite(CurBdPtr, XAXIDMA_BD_STS_OFFSET, BdSts);
|
||||
|
||||
uintptr_t tdesc = ((uintptr_t)txRing->HwTail +
|
||||
(txRing->FirstBdPhysAddr - txRing->FirstBdAddr)) &
|
||||
XAXIDMA_DESC_LSB_MASK;
|
||||
XAxiDma_WriteReg(txRing->ChanBase, XAXIDMA_TDESC_OFFSET, tdesc);
|
||||
|
||||
hwLock.unlock();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Dma::writeScatterGatherPrepare(const MemoryBlock &mem, size_t len) {
|
||||
|
||||
auto &mm = MemoryManager::get();
|
||||
|
||||
// User has to make sure that memory is accessible, otherwise this will throw
|
||||
auto trans = mm.getTranslation(busMasterInterfaces[mm2sInterface],
|
||||
mem.getAddrSpaceId());
|
||||
void *buf = reinterpret_cast<void *>(trans.getLocalAddr(0));
|
||||
if (buf == nullptr) {
|
||||
throw RuntimeError("Buffer was null");
|
||||
}
|
||||
hwLock.lock();
|
||||
|
||||
auto bd = writeScatterGatherSetupBd(buf, len);
|
||||
|
||||
hwLock.unlock();
|
||||
|
||||
return bd != nullptr;
|
||||
}
|
||||
|
||||
XAxiDma_Bd *Dma::writeScatterGatherSetupBd(const void *buf, size_t len) {
|
||||
uint32_t ret = XST_FAILURE;
|
||||
if (len == 0)
|
||||
return nullptr;
|
||||
|
||||
if (len > FPGA_DMA_BOUNDARY)
|
||||
return nullptr;
|
||||
|
||||
auto *txRing = XAxiDma_GetTxRing(&xDma);
|
||||
if (txRing == nullptr) {
|
||||
hwLock.unlock();
|
||||
|
@ -325,7 +366,22 @@ bool Dma::writeScatterGather(const void *buf, size_t len) {
|
|||
|
||||
// TODO: Check if we really need this
|
||||
XAxiDma_BdSetId(bd, (uintptr_t)buf);
|
||||
return bd;
|
||||
}
|
||||
|
||||
// Write a single message
|
||||
bool Dma::writeScatterGather(const void *buf, size_t len) {
|
||||
// buf is address from view of DMA controller
|
||||
|
||||
int ret = XST_FAILURE;
|
||||
hwLock.lock();
|
||||
auto *txRing = XAxiDma_GetTxRing(&xDma);
|
||||
if (txRing == nullptr) {
|
||||
hwLock.unlock();
|
||||
throw RuntimeError("TxRing was null.");
|
||||
}
|
||||
|
||||
XAxiDma_Bd *bd = writeScatterGatherSetupBd(buf, len);
|
||||
// Give control of BD to HW. We should not access it until transfer is finished.
|
||||
// Failure could also indicate that EOF is not set on last Bd
|
||||
ret = XAxiDma_BdRingToHw(txRing, 1, bd);
|
||||
|
@ -339,20 +395,64 @@ bool Dma::writeScatterGather(const void *buf, size_t len) {
|
|||
return true;
|
||||
}
|
||||
|
||||
bool Dma::readScatterGather(void *buf, size_t len) {
|
||||
int ret = XST_FAILURE;
|
||||
|
||||
if (len < readCoalesce * readMsgSize) {
|
||||
throw RuntimeError(
|
||||
"Read size is smaller than readCoalesce*msgSize. Cannot setup BDs.");
|
||||
}
|
||||
|
||||
// Reuse existing single BD bypassing BdRingFree, Alloc, ToHw
|
||||
bool Dma::readScatterGatherFast() {
|
||||
hwLock.lock();
|
||||
auto *rxRing = XAxiDma_GetRxRing(&xDma);
|
||||
if (rxRing == nullptr) {
|
||||
hwLock.unlock();
|
||||
throw RuntimeError("RxRing was null.");
|
||||
}
|
||||
XAxiDma_Bd *CurBdPtr = rxRing->HwHead;
|
||||
// Poll BD status to avoid accessing PCIe address space
|
||||
uint32_t BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
|
||||
|
||||
// Clear the bit we are polling on in complete
|
||||
BdSts &= ~XAXIDMA_BD_STS_COMPLETE_MASK;
|
||||
XAxiDma_BdWrite(CurBdPtr, XAXIDMA_BD_STS_OFFSET, BdSts);
|
||||
|
||||
uintptr_t tdesc = ((uintptr_t)rxRing->HwTail +
|
||||
(rxRing->FirstBdPhysAddr - rxRing->FirstBdAddr)) &
|
||||
XAXIDMA_DESC_LSB_MASK;
|
||||
XAxiDma_WriteReg(rxRing->ChanBase, XAXIDMA_TDESC_OFFSET, tdesc);
|
||||
|
||||
hwLock.unlock();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Dma::readScatterGatherPrepare(const MemoryBlock &mem, size_t len) {
|
||||
|
||||
auto &mm = MemoryManager::get();
|
||||
|
||||
// User has to make sure that memory is accessible, otherwise this will throw
|
||||
auto trans = mm.getTranslation(busMasterInterfaces[s2mmInterface],
|
||||
mem.getAddrSpaceId());
|
||||
void *buf = reinterpret_cast<void *>(trans.getLocalAddr(0));
|
||||
if (buf == nullptr) {
|
||||
throw RuntimeError("Buffer was null");
|
||||
}
|
||||
hwLock.lock();
|
||||
|
||||
auto bd = readScatterGatherSetupBd(buf, len);
|
||||
|
||||
hwLock.unlock();
|
||||
|
||||
return bd != nullptr;
|
||||
}
|
||||
|
||||
XAxiDma_Bd *Dma::readScatterGatherSetupBd(void *buf, size_t len) {
|
||||
uint32_t ret = XST_FAILURE;
|
||||
if (len == 0)
|
||||
return nullptr;
|
||||
|
||||
if (len > FPGA_DMA_BOUNDARY)
|
||||
return nullptr;
|
||||
|
||||
auto *rxRing = XAxiDma_GetRxRing(&xDma);
|
||||
if (rxRing == nullptr) {
|
||||
hwLock.unlock();
|
||||
throw RuntimeError("RxRing was null.");
|
||||
}
|
||||
|
||||
XAxiDma_Bd *bd;
|
||||
ret = XAxiDma_BdRingAlloc(rxRing, readCoalesce, &bd);
|
||||
|
@ -388,6 +488,25 @@ bool Dma::readScatterGather(void *buf, size_t len) {
|
|||
curBuf += readMsgSize;
|
||||
curBd = (XAxiDma_Bd *)XAxiDma_BdRingNext(rxRing, curBd);
|
||||
}
|
||||
return bd;
|
||||
}
|
||||
|
||||
bool Dma::readScatterGather(void *buf, size_t len) {
|
||||
uint32_t ret = XST_FAILURE;
|
||||
|
||||
if (len < readCoalesce * readMsgSize) {
|
||||
throw RuntimeError(
|
||||
"Read size is smaller than readCoalesce*msgSize. Cannot setup BDs.");
|
||||
}
|
||||
|
||||
hwLock.lock();
|
||||
auto *rxRing = XAxiDma_GetRxRing(&xDma);
|
||||
if (rxRing == nullptr) {
|
||||
hwLock.unlock();
|
||||
throw RuntimeError("RxRing was null.");
|
||||
}
|
||||
|
||||
XAxiDma_Bd *bd = readScatterGatherSetupBd(buf, len);
|
||||
|
||||
ret = XAxiDma_BdRingToHw(rxRing, readCoalesce, bd);
|
||||
if (ret != XST_SUCCESS) {
|
||||
|
@ -399,30 +518,36 @@ bool Dma::readScatterGather(void *buf, size_t len) {
|
|||
return true;
|
||||
}
|
||||
|
||||
size_t Dma::writeScatterGatherPoll(bool lock) {
|
||||
if (lock) {
|
||||
hwLock.lock();
|
||||
}
|
||||
auto txRing = XAxiDma_GetTxRing(&xDma);
|
||||
XAxiDma_Bd *CurBdPtr = txRing->HwHead;
|
||||
volatile uint32_t BdSts;
|
||||
// Poll BD status to avoid accessing PCIe address space
|
||||
do {
|
||||
BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
|
||||
} while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
|
||||
// At this point, we know that the transmission is complete, but we haven't accessed the
|
||||
// PCIe address space, yet. We know that we have received at least one BD.
|
||||
if (lock) {
|
||||
hwLock.unlock();
|
||||
}
|
||||
return XAxiDma_BdGetActualLength(CurBdPtr, XAXIDMA_MCHAN_MAX_TRANSFER_LEN);
|
||||
}
|
||||
|
||||
Dma::Completion Dma::writeCompleteScatterGather() {
|
||||
Completion c;
|
||||
XAxiDma_Bd *bd = nullptr, *curBd;
|
||||
auto txRing = XAxiDma_GetTxRing(&xDma);
|
||||
int ret = XST_FAILURE;
|
||||
uint32_t ret = XST_FAILURE;
|
||||
static size_t errcnt = 32;
|
||||
|
||||
uint32_t irqStatus = 0;
|
||||
if (polling) {
|
||||
hwLock.lock();
|
||||
XAxiDma_Bd *CurBdPtr = txRing->HwHead;
|
||||
volatile uint32_t BdSts;
|
||||
// Poll BD status to avoid accessing PCIe address space
|
||||
do {
|
||||
BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
|
||||
} while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
|
||||
// At this point, we know that the transmission is complete, but we haven't accessed the
|
||||
// PCIe address space, yet. The subsequent DMA Controller management can be done in a
|
||||
// separate thread to keep latencies in this thread extremly low. We know that we have
|
||||
// received one BD.
|
||||
do {
|
||||
// This takes 1.5 us
|
||||
irqStatus = XAxiDma_BdRingGetIrq(txRing);
|
||||
} while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK));
|
||||
writeScatterGatherPoll(false);
|
||||
} else {
|
||||
c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt(
|
||||
irqs[mm2sInterrupt].num);
|
||||
|
@ -441,8 +566,8 @@ Dma::Completion Dma::writeCompleteScatterGather() {
|
|||
// Acknowledge the interrupt
|
||||
if (!polling) {
|
||||
irqStatus = XAxiDma_BdRingGetIrq(txRing);
|
||||
XAxiDma_BdRingAckIrq(txRing, irqStatus);
|
||||
}
|
||||
XAxiDma_BdRingAckIrq(txRing, irqStatus);
|
||||
|
||||
if (c.bds == 0) {
|
||||
c.bytes = 0;
|
||||
|
@ -461,7 +586,7 @@ Dma::Completion Dma::writeCompleteScatterGather() {
|
|||
if ((ret & XAXIDMA_BD_STS_ALL_ERR_MASK) ||
|
||||
(!(ret & XAXIDMA_BD_STS_COMPLETE_MASK))) {
|
||||
hwLock.unlock();
|
||||
throw RuntimeError("Bd Status register shows error: {}", ret);
|
||||
throw RuntimeError("Write: Bd Status register shows error: {:#x}", ret);
|
||||
}
|
||||
|
||||
c.bytes += XAxiDma_BdGetLength(bd, txRing->MaxTransferLen);
|
||||
|
@ -478,6 +603,25 @@ Dma::Completion Dma::writeCompleteScatterGather() {
|
|||
return c;
|
||||
}
|
||||
|
||||
size_t Dma::readScatterGatherPoll(bool lock) {
|
||||
if (lock) {
|
||||
hwLock.lock();
|
||||
}
|
||||
auto rxRing = XAxiDma_GetRxRing(&xDma);
|
||||
XAxiDma_Bd *CurBdPtr = rxRing->HwHead;
|
||||
volatile uint32_t BdSts;
|
||||
// Poll BD status to avoid accessing PCIe address space
|
||||
do {
|
||||
BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
|
||||
} while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
|
||||
// At this point, we know that the transmission is complete, but we haven't accessed the
|
||||
// PCIe address space, yet. We know that we have received at least one BD.
|
||||
if (lock) {
|
||||
hwLock.unlock();
|
||||
}
|
||||
return XAxiDma_BdGetActualLength(CurBdPtr, XAXIDMA_MCHAN_MAX_TRANSFER_LEN);
|
||||
}
|
||||
|
||||
Dma::Completion Dma::readCompleteScatterGather() {
|
||||
Completion c;
|
||||
XAxiDma_Bd *bd = nullptr, *curBd;
|
||||
|
@ -489,20 +633,7 @@ Dma::Completion Dma::readCompleteScatterGather() {
|
|||
uint32_t irqStatus = 0;
|
||||
if (polling) {
|
||||
hwLock.lock();
|
||||
XAxiDma_Bd *CurBdPtr = rxRing->HwHead;
|
||||
volatile uint32_t BdSts;
|
||||
// Poll BD status to avoid accessing PCIe address space
|
||||
do {
|
||||
BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
|
||||
} while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
|
||||
// At this point, we know that the transmission is complete, but we haven't accessed the
|
||||
// PCIe address space, yet. The subsequent DMA Controller management can be done in a
|
||||
// separate thread to keep latencies in this thread extremly low. We know that we have
|
||||
// received one BD.
|
||||
do {
|
||||
// This takes 1.5 us
|
||||
irqStatus = XAxiDma_BdRingGetIrq(rxRing);
|
||||
} while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK));
|
||||
readScatterGatherPoll(false);
|
||||
intrs = 1;
|
||||
} else {
|
||||
intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt(
|
||||
|
@ -521,18 +652,17 @@ Dma::Completion Dma::readCompleteScatterGather() {
|
|||
c.interrupts = 0;
|
||||
return c;
|
||||
} else {
|
||||
hwLock.unlock();
|
||||
c.interrupts = intrs;
|
||||
}
|
||||
if (!polling) {
|
||||
irqStatus = XAxiDma_BdRingGetIrq(rxRing);
|
||||
XAxiDma_BdRingAckIrq(rxRing, irqStatus);
|
||||
if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) {
|
||||
logger->error("Expected IOC interrupt but IRQ status is: {:#x}",
|
||||
irqStatus);
|
||||
return c;
|
||||
}
|
||||
}
|
||||
XAxiDma_BdRingAckIrq(rxRing, irqStatus);
|
||||
if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) {
|
||||
logger->error("Expected IOC interrupt but IRQ status is: {:#x}", irqStatus);
|
||||
return c;
|
||||
}
|
||||
|
||||
// Wait until the data has been received by the RX channel.
|
||||
if ((c.bds = XAxiDma_BdRingFromHw(rxRing, readCoalesce, &bd)) <
|
||||
readCoalesce) {
|
||||
|
@ -564,7 +694,7 @@ Dma::Completion Dma::readCompleteScatterGather() {
|
|||
if ((ret & XAXIDMA_BD_STS_ALL_ERR_MASK) ||
|
||||
(!(ret & XAXIDMA_BD_STS_COMPLETE_MASK))) {
|
||||
hwLock.unlock();
|
||||
throw RuntimeError("Bd Status register shows error: {}", ret);
|
||||
throw RuntimeError("Read: Bd Status register shows error: {}", ret);
|
||||
}
|
||||
|
||||
c.bytes += XAxiDma_BdGetActualLength(bd, rxRing->MaxTransferLen);
|
||||
|
|
|
@ -43,9 +43,12 @@ bool Register::check() {
|
|||
}
|
||||
|
||||
// This is Dino specific for now - we should possibly move this to Dino in the future
|
||||
setRegister(0, static_cast<uint32_t>(1000)); // set Dino to a rate of 20 kHz
|
||||
setRegister(1, -0.001615254F);
|
||||
setRegister(2, 10.8061F);
|
||||
constexpr double dinoClk = 25e9; // Dino is clocked with 25 Mhz
|
||||
constexpr double sampleRate = 20e6; // We want to achieve a timestep of 50us
|
||||
constexpr uint32_t dinoTimerVal = static_cast<uint32_t>(dinoClk / sampleRate);
|
||||
setRegister(0, dinoTimerVal); // Timer value for generating ADC trigger signal
|
||||
setRegister(1, -0.001615254F); // Scale factor for ADC value
|
||||
setRegister(2, 10.8061F); // Offset for ADC value
|
||||
uint32_t rate = getRegister(0);
|
||||
float scale = getRegisterFloat(1);
|
||||
float offset = getRegisterFloat(2);
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <thread>
|
||||
#include <villas/format.hpp>
|
||||
#include <villas/node.hpp>
|
||||
#include <villas/node/config.hpp>
|
||||
|
@ -31,15 +32,31 @@ protected:
|
|||
std::string cardName;
|
||||
std::list<std::string> connectStrings;
|
||||
|
||||
// This setting improves latency by removing various checks.
|
||||
// Use with caution! Requires read cache in FPGA design!
|
||||
// The common use case in VILLASfpga is that we have exactly
|
||||
// one write for every read and the number of exchanged signals
|
||||
// do not change. If this is the case, we can reuse the buffer
|
||||
// descriptors during reads and writes, thus avoiding freeing,
|
||||
// reallocating and setting them up.
|
||||
// We set up the descriptors in start, and in write or read,
|
||||
// we only reset the complete bit in the buffer descriptor and
|
||||
// write to the tdesc register to start the DMA transfer.
|
||||
// Improves read/write latency by approx. 40%.
|
||||
bool lowLatencyMode;
|
||||
|
||||
// State
|
||||
std::shared_ptr<fpga::Card> card;
|
||||
std::shared_ptr<villas::fpga::ip::Dma> dma;
|
||||
std::shared_ptr<villas::MemoryBlock> blockRx[2];
|
||||
std::shared_ptr<villas::MemoryBlock> blockRx;
|
||||
std::shared_ptr<villas::MemoryBlock> blockTx;
|
||||
|
||||
// Non-public methods
|
||||
virtual int fastRead(Sample *smps[], unsigned cnt);
|
||||
virtual int slowRead(Sample *smps[], unsigned cnt);
|
||||
virtual int _read(Sample *smps[], unsigned cnt) override;
|
||||
|
||||
virtual int fastWrite(Sample *smps[], unsigned cnt);
|
||||
virtual int slowWrite(Sample *smps[], unsigned cnt);
|
||||
virtual int _write(Sample *smps[], unsigned cnt) override;
|
||||
|
||||
public:
|
||||
|
@ -55,6 +72,8 @@ public:
|
|||
|
||||
virtual int start() override;
|
||||
|
||||
virtual int stop() override;
|
||||
|
||||
virtual std::vector<int> getPollFDs() override;
|
||||
|
||||
virtual const std::string &getDetails() override;
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
|
||||
#include <jansson.h>
|
||||
|
@ -32,8 +33,8 @@ static std::list<std::shared_ptr<fpga::Card>> cards;
|
|||
static std::shared_ptr<kernel::vfio::Container> vfioContainer;
|
||||
|
||||
FpgaNode::FpgaNode(const uuid_t &id, const std::string &name)
|
||||
: Node(id, name), cardName(""), card(nullptr), dma(), blockRx(), blockTx() {
|
||||
}
|
||||
: Node(id, name), cardName(""), connectStrings(), lowLatencyMode(false),
|
||||
card(nullptr), dma(), blockRx(), blockTx() {}
|
||||
|
||||
// Nothing to clean up explicitly in the destructor body.
FpgaNode::~FpgaNode() = default;
|
||||
|
||||
|
@ -75,14 +76,12 @@ int FpgaNode::prepare() {
|
|||
|
||||
auto &alloc = HostRam::getAllocator();
|
||||
|
||||
blockRx[0] = alloc.allocateBlock(0x200 * sizeof(float));
|
||||
blockRx[1] = alloc.allocateBlock(0x200 * sizeof(float));
|
||||
blockRx = alloc.allocateBlock(0x200 * sizeof(float));
|
||||
blockTx = alloc.allocateBlock(0x200 * sizeof(float));
|
||||
villas::MemoryAccessor<float> memRx[] = {*(blockRx[0]), *(blockRx[1])};
|
||||
villas::MemoryAccessor<float> memRx = *blockRx;
|
||||
villas::MemoryAccessor<float> memTx = *blockTx;
|
||||
|
||||
dma->makeAccesibleFromVA(blockRx[0]);
|
||||
dma->makeAccesibleFromVA(blockRx[1]);
|
||||
dma->makeAccesibleFromVA(blockRx);
|
||||
dma->makeAccesibleFromVA(blockTx);
|
||||
|
||||
MemoryManager::get().printGraph();
|
||||
|
@ -90,6 +89,8 @@ int FpgaNode::prepare() {
|
|||
return Node::prepare();
|
||||
}
|
||||
|
||||
// Stop the node; no FPGA-specific work is done here, the call is forwarded
// to the base class.
int FpgaNode::stop() {
  const int ret = Node::stop();
  return ret;
}
|
||||
|
||||
int FpgaNode::parse(json_t *json) {
|
||||
int ret = Node::parse(json);
|
||||
if (ret) {
|
||||
|
@ -105,8 +106,9 @@ int FpgaNode::parse(json_t *json) {
|
|||
vfioContainer = std::make_shared<kernel::vfio::Container>();
|
||||
}
|
||||
|
||||
ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o}", "card", &jsonCard,
|
||||
"connect", &jsonConnectStrings);
|
||||
ret = json_unpack_ex(json, &err, 0, "{ s: o, s?: o, s?: b, s?: b}", "card",
|
||||
&jsonCard, "connect", &jsonConnectStrings,
|
||||
"lowLatencyMode", &lowLatencyMode);
|
||||
if (ret) {
|
||||
throw ConfigError(json, err, "node-config-fpga",
|
||||
"Failed to parse configuration of node {}",
|
||||
|
@ -158,7 +160,8 @@ const std::string &FpgaNode::getDetails() {
|
|||
std::copy(connectStrings.begin(), connectStrings.end(),
|
||||
std::ostream_iterator<std::string>(imploded, delim));
|
||||
|
||||
details = fmt::format("fpga={}, connect={}", name, imploded.str());
|
||||
details = fmt::format("fpga={}, connect={}, lowLatencyMode={}", name,
|
||||
imploded.str(), lowLatencyMode);
|
||||
}
|
||||
|
||||
return details;
|
||||
|
@ -167,19 +170,99 @@ const std::string &FpgaNode::getDetails() {
|
|||
// No configuration checks are performed for this node type; always returns 0.
int FpgaNode::check() {
  return 0;
}
|
||||
|
||||
int FpgaNode::start() {
|
||||
// enque first read
|
||||
// dma->read(*(blockRx[0]), blockRx[0]->getSize());
|
||||
if (getInputSignalsMaxCount() * sizeof(float) > blockRx->getSize()) {
|
||||
logger->error("Input signals exceed block size.");
|
||||
throw villas ::RuntimeError("Input signals exceed block size.");
|
||||
}
|
||||
if (lowLatencyMode) {
|
||||
dma->readScatterGatherPrepare(*blockRx, blockRx->getSize());
|
||||
if (getInputSignalsMaxCount() != 0) {
|
||||
dma->writeScatterGatherPrepare(*blockTx,
|
||||
getInputSignalsMaxCount() * sizeof(float));
|
||||
} else {
|
||||
logger->warn("No input signals defined. Not preparing write buffer - "
|
||||
"writes will not work.");
|
||||
}
|
||||
}
|
||||
|
||||
return Node::start();
|
||||
}
|
||||
|
||||
// We cannot modify the BD here, so writes are fixed length.
|
||||
// If fastWrite receives less signals than expected, the previous data
|
||||
// will be reused for the remaining signals
|
||||
int FpgaNode::fastWrite(Sample *smps[], unsigned cnt) {
|
||||
Sample *smp = smps[0];
|
||||
|
||||
assert(cnt == 1 && smps != nullptr && smps[0] != nullptr);
|
||||
|
||||
auto mem = MemoryAccessor<uint32_t>(*blockTx);
|
||||
float scaled;
|
||||
|
||||
for (unsigned i = 0; i < smp->length; i++) {
|
||||
if (smp->signals->getByIndex(i)->type == SignalType::FLOAT) {
|
||||
scaled = smp->data[i].f;
|
||||
if (scaled > 10.) {
|
||||
scaled = 10.;
|
||||
} else if (scaled < -10.) {
|
||||
scaled = -10.;
|
||||
}
|
||||
mem[i] = (scaled + 10.) * ((float)0xFFFF / 20.);
|
||||
} else {
|
||||
mem[i] = smp->data[i].i;
|
||||
}
|
||||
}
|
||||
|
||||
dma->writeScatterGatherFast();
|
||||
auto written = dma->writeScatterGatherPoll() /
|
||||
sizeof(float); // The number of samples written
|
||||
|
||||
if (written != smp->length) {
|
||||
logger->warn("Wrote {} samples, but {} were expected", written,
|
||||
smp->length);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Because we cannot modify the BD here, reads are fixed length.
|
||||
// However, if we receive less data than expected, we will return only
|
||||
// what we have received. fastRead is thus capable of partial reads.
|
||||
int FpgaNode::fastRead(Sample *smps[], unsigned cnt) {
|
||||
Sample *smp = smps[0];
|
||||
auto mem = MemoryAccessor<float>(*blockRx);
|
||||
|
||||
smp->flags = (int)SampleFlags::HAS_DATA;
|
||||
smp->signals = in.signals;
|
||||
|
||||
dma->readScatterGatherFast();
|
||||
auto read = dma->readScatterGatherPoll(true);
|
||||
// We assume a lot without checking at this point. All for the latency!
|
||||
|
||||
smp->length = 0;
|
||||
for (unsigned i = 0; i < MIN(read / sizeof(float), smp->capacity); i++) {
|
||||
smp->data[i].f = static_cast<double>(mem[i]);
|
||||
smp->length++;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Read entry point: dispatches to the low latency or the regular read path
// depending on the lowLatencyMode setting.
int FpgaNode::_read(Sample *smps[], unsigned cnt) {
  if (lowLatencyMode) {
    return fastRead(smps, cnt);
  }

  return slowRead(smps, cnt);
}
|
||||
|
||||
int FpgaNode::slowRead(Sample *smps[], unsigned cnt) {
|
||||
unsigned read;
|
||||
Sample *smp = smps[0];
|
||||
|
||||
assert(cnt == 1);
|
||||
|
||||
dma->read(*(blockRx[next]), blockRx[next]->getSize()); // TODO: calc size
|
||||
dma->read(*blockRx, blockRx->getSize());
|
||||
auto c = dma->readComplete();
|
||||
|
||||
read = c.bytes / sizeof(float);
|
||||
|
@ -188,7 +271,7 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) {
|
|||
logger->warn("Missed {} interrupts", c.interrupts - 1);
|
||||
}
|
||||
|
||||
auto mem = MemoryAccessor<float>(*(blockRx[cur]));
|
||||
auto mem = MemoryAccessor<float>(*blockRx);
|
||||
|
||||
smp->length = 0;
|
||||
for (unsigned i = 0; i < MIN(read, smp->capacity); i++) {
|
||||
|
@ -198,14 +281,20 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) {
|
|||
smp->flags = (int)SampleFlags::HAS_DATA;
|
||||
|
||||
smp->signals = in.signals;
|
||||
//cur = next;
|
||||
//next = (next + 1) % (sizeof(blockRx) / sizeof(blockRx[0]));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Write entry point: dispatches to the low latency or the regular write path
// depending on the lowLatencyMode setting.
int FpgaNode::_write(Sample *smps[], unsigned cnt) {
  if (lowLatencyMode) {
    return fastWrite(smps, cnt);
  }

  return slowWrite(smps, cnt);
}
|
||||
|
||||
int FpgaNode::slowWrite(Sample *smps[], unsigned cnt) {
|
||||
// unsigned int written;
|
||||
Sample *smp = smps[0];
|
||||
|
||||
assert(cnt == 1 && smps != nullptr && smps[0] != nullptr);
|
||||
|
@ -224,11 +313,11 @@ int FpgaNode::_write(Sample *smps[], unsigned cnt) {
|
|||
}
|
||||
|
||||
bool state = dma->write(*blockTx, smp->length * sizeof(float));
|
||||
if (!state)
|
||||
if (!state) {
|
||||
return -1;
|
||||
|
||||
written = dma->writeComplete().bytes /
|
||||
sizeof(float); // The number of samples written
|
||||
}
|
||||
auto written = dma->writeComplete().bytes /
|
||||
sizeof(float); // The number of samples written
|
||||
|
||||
if (written != smp->length) {
|
||||
logger->warn("Wrote {} samples, but {} were expected", written,
|
||||
|
|
Loading…
Add table
Reference in a new issue