mirror of
https://git.rwth-aachen.de/acs/public/villas/node/
synced 2025-03-09 00:00:00 +01:00
fpga: DMA: poll BD instead of hardware register
polling HW is slow (>1us). Polling RAM is faster. This is a first implementation which only polls the first BD that is active. This is why this commit also removes the second read in nodes/fpga. This is not really useful anyways. Signed-off-by: Niklas Eiling <niklas.eiling@eonerc.rwth-aachen.de>
This commit is contained in:
parent
2f0a10c49b
commit
d9b3bdb0de
2 changed files with 66 additions and 18 deletions
|
@ -57,11 +57,12 @@ bool Dma::init() {
|
|||
setupScatterGather();
|
||||
}
|
||||
|
||||
irqs[mm2sInterrupt].irqController->enableInterrupt(irqs[mm2sInterrupt],
|
||||
polling);
|
||||
irqs[s2mmInterrupt].irqController->enableInterrupt(irqs[s2mmInterrupt],
|
||||
polling);
|
||||
|
||||
if (!polling) {
|
||||
irqs[mm2sInterrupt].irqController->enableInterrupt(irqs[mm2sInterrupt],
|
||||
polling);
|
||||
irqs[s2mmInterrupt].irqController->enableInterrupt(irqs[s2mmInterrupt],
|
||||
polling);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -341,9 +342,10 @@ bool Dma::writeScatterGather(const void *buf, size_t len) {
|
|||
bool Dma::readScatterGather(void *buf, size_t len) {
|
||||
int ret = XST_FAILURE;
|
||||
|
||||
if (len < readCoalesce * readMsgSize)
|
||||
if (len < readCoalesce * readMsgSize) {
|
||||
throw RuntimeError(
|
||||
"Read size is smaller than readCoalesce*msgSize. Cannot setup BDs.");
|
||||
}
|
||||
|
||||
hwLock.lock();
|
||||
auto *rxRing = XAxiDma_GetRxRing(&xDma);
|
||||
|
@ -404,10 +406,29 @@ Dma::Completion Dma::writeCompleteScatterGather() {
|
|||
int ret = XST_FAILURE;
|
||||
static size_t errcnt = 32;
|
||||
|
||||
c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt(
|
||||
irqs[mm2sInterrupt].num);
|
||||
uint32_t irqStatus = 0;
|
||||
if (polling) {
|
||||
hwLock.lock();
|
||||
XAxiDma_Bd *CurBdPtr = txRing->HwHead;
|
||||
volatile uint32_t BdSts;
|
||||
// Poll BD status to avoid accessing PCIe address space
|
||||
do {
|
||||
BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
|
||||
} while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
|
||||
// At this point, we know that the transmission is complete, but we haven't accessed the
|
||||
// PCIe address space, yet. The subsequent DMA Controller management can be done in a
|
||||
// separate thread to keep latencies in this thread extremly low. We know that we have
|
||||
// received one BD.
|
||||
do {
|
||||
// This takes 1.5 us
|
||||
irqStatus = XAxiDma_BdRingGetIrq(txRing);
|
||||
} while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK));
|
||||
} else {
|
||||
c.interrupts = irqs[mm2sInterrupt].irqController->waitForInterrupt(
|
||||
irqs[mm2sInterrupt].num);
|
||||
hwLock.lock();
|
||||
}
|
||||
|
||||
hwLock.lock();
|
||||
if ((c.bds = XAxiDma_BdRingFromHw(txRing, writeCoalesce, &bd)) <
|
||||
writeCoalesce) {
|
||||
logger->warn("Send partial batch of {}/{} BDs.", c.bds, writeCoalesce);
|
||||
|
@ -418,7 +439,9 @@ Dma::Completion Dma::writeCompleteScatterGather() {
|
|||
}
|
||||
|
||||
// Acknowledge the interrupt
|
||||
auto irqStatus = XAxiDma_BdRingGetIrq(txRing);
|
||||
if (!polling) {
|
||||
irqStatus = XAxiDma_BdRingGetIrq(txRing);
|
||||
}
|
||||
XAxiDma_BdRingAckIrq(txRing, irqStatus);
|
||||
|
||||
if (c.bds == 0) {
|
||||
|
@ -462,8 +485,30 @@ Dma::Completion Dma::readCompleteScatterGather() {
|
|||
int ret = XST_FAILURE;
|
||||
static size_t errcnt = 32;
|
||||
|
||||
ssize_t intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt(
|
||||
irqs[s2mmInterrupt].num);
|
||||
ssize_t intrs = 0;
|
||||
uint32_t irqStatus = 0;
|
||||
if (polling) {
|
||||
hwLock.lock();
|
||||
XAxiDma_Bd *CurBdPtr = rxRing->HwHead;
|
||||
volatile uint32_t BdSts;
|
||||
// Poll BD status to avoid accessing PCIe address space
|
||||
do {
|
||||
BdSts = XAxiDma_ReadReg((UINTPTR)CurBdPtr, XAXIDMA_BD_STS_OFFSET);
|
||||
} while (!(BdSts & XAXIDMA_BD_STS_COMPLETE_MASK));
|
||||
// At this point, we know that the transmission is complete, but we haven't accessed the
|
||||
// PCIe address space, yet. The subsequent DMA Controller management can be done in a
|
||||
// separate thread to keep latencies in this thread extremly low. We know that we have
|
||||
// received one BD.
|
||||
do {
|
||||
// This takes 1.5 us
|
||||
irqStatus = XAxiDma_BdRingGetIrq(rxRing);
|
||||
} while (!(irqStatus & XAXIDMA_IRQ_IOC_MASK));
|
||||
intrs = 1;
|
||||
} else {
|
||||
intrs = irqs[s2mmInterrupt].irqController->waitForInterrupt(
|
||||
irqs[s2mmInterrupt].num);
|
||||
hwLock.lock();
|
||||
}
|
||||
|
||||
if (intrs < 0) {
|
||||
logger->warn("Interrupt error or timeout: {}", intrs);
|
||||
|
@ -471,14 +516,17 @@ Dma::Completion Dma::readCompleteScatterGather() {
|
|||
// Free all RX BDs for future transmission.
|
||||
int bds = XAxiDma_BdRingFromHw(rxRing, XAXIDMA_ALL_BDS, &bd);
|
||||
XAxiDma_BdRingFree(rxRing, bds, bd);
|
||||
hwLock.unlock();
|
||||
|
||||
c.interrupts = 0;
|
||||
return c;
|
||||
} else {
|
||||
hwLock.unlock();
|
||||
c.interrupts = intrs;
|
||||
}
|
||||
hwLock.lock();
|
||||
auto irqStatus = XAxiDma_BdRingGetIrq(rxRing);
|
||||
if (!polling) {
|
||||
irqStatus = XAxiDma_BdRingGetIrq(rxRing);
|
||||
}
|
||||
XAxiDma_BdRingAckIrq(rxRing, irqStatus);
|
||||
if (!(irqStatus & XAXIDMA_IRQ_IOC_MASK)) {
|
||||
logger->error("Expected IOC interrupt but IRQ status is: {:#x}", irqStatus);
|
||||
|
|
|
@ -169,12 +169,12 @@ int FpgaNode::check() { return 0; }
|
|||
|
||||
int FpgaNode::start() {
|
||||
// enque first read
|
||||
dma->read(*(blockRx[0]), blockRx[0]->getSize());
|
||||
// dma->read(*(blockRx[0]), blockRx[0]->getSize());
|
||||
return Node::start();
|
||||
}
|
||||
|
||||
int FpgaNode::_read(Sample *smps[], unsigned cnt) {
|
||||
static size_t cur = 0, next = 1;
|
||||
static size_t cur = 0, next = 0;
|
||||
unsigned read;
|
||||
Sample *smp = smps[0];
|
||||
|
||||
|
@ -199,8 +199,8 @@ int FpgaNode::_read(Sample *smps[], unsigned cnt) {
|
|||
smp->flags = (int)SampleFlags::HAS_DATA;
|
||||
|
||||
smp->signals = in.signals;
|
||||
cur = next;
|
||||
next = (next + 1) % (sizeof(blockRx) / sizeof(blockRx[0]));
|
||||
//cur = next;
|
||||
//next = (next + 1) % (sizeof(blockRx) / sizeof(blockRx[0]));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue