mirror of
https://git.rwth-aachen.de/acs/public/villas/node/
synced 2025-03-09 00:00:00 +01:00
Merge branch 'feature/gpu' into 'develop'
Add basic GPU/CUDA integration as a shared library See merge request acs/public/villas/VILLASfpga-code!9
This commit is contained in:
commit
929ed2f393
33 changed files with 1577 additions and 145 deletions
6
fpga/.gitmodules
vendored
6
fpga/.gitmodules
vendored
|
@ -4,3 +4,9 @@
|
|||
[submodule "thirdparty/libxil"]
|
||||
path = thirdparty/libxil
|
||||
url = https://git.rwth-aachen.de/acs/public/villas/libxil.git
|
||||
[submodule "lib/gpu/gdrcopy"]
|
||||
path = lib/gpu/gdrcopy
|
||||
url = https://github.com/daniel-k/gdrcopy.git
|
||||
[submodule "thirdparty/udmabuf"]
|
||||
path = thirdparty/udmabuf
|
||||
url = https://github.com/ikwzm/udmabuf
|
||||
|
|
|
@ -120,30 +120,30 @@
|
|||
"M_AXI_MM2S": {
|
||||
"pcie_0_axi_pcie_0": {
|
||||
"BAR0": {
|
||||
"baseaddr": 2147483648,
|
||||
"baseaddr": 0,
|
||||
"highaddr": 4294967295,
|
||||
"size": 2147483648
|
||||
"size": 4294967296
|
||||
}
|
||||
}
|
||||
},
|
||||
"M_AXI_S2MM": {
|
||||
"pcie_0_axi_pcie_0": {
|
||||
"BAR0": {
|
||||
"baseaddr": 2147483648,
|
||||
"baseaddr": 0,
|
||||
"highaddr": 4294967295,
|
||||
"size": 2147483648
|
||||
"size": 4294967296
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"ports": [
|
||||
{
|
||||
"role": "initiator",
|
||||
"role": "master",
|
||||
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
|
||||
"name": "MM2S"
|
||||
},
|
||||
{
|
||||
"role": "target",
|
||||
"role": "slave",
|
||||
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
|
||||
"name": "S2MM"
|
||||
}
|
||||
|
@ -159,30 +159,30 @@
|
|||
"M_AXI_MM2S": {
|
||||
"pcie_0_axi_pcie_0": {
|
||||
"BAR0": {
|
||||
"baseaddr": 2147483648,
|
||||
"baseaddr": 0,
|
||||
"highaddr": 4294967295,
|
||||
"size": 2147483648
|
||||
"size": 4294967296
|
||||
}
|
||||
}
|
||||
},
|
||||
"M_AXI_S2MM": {
|
||||
"pcie_0_axi_pcie_0": {
|
||||
"BAR0": {
|
||||
"baseaddr": 2147483648,
|
||||
"baseaddr": 0,
|
||||
"highaddr": 4294967295,
|
||||
"size": 2147483648
|
||||
"size": 4294967296
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"ports": [
|
||||
{
|
||||
"role": "initiator",
|
||||
"role": "master",
|
||||
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
|
||||
"name": "MM2S"
|
||||
},
|
||||
{
|
||||
"role": "target",
|
||||
"role": "slave",
|
||||
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
|
||||
"name": "S2MM"
|
||||
}
|
||||
|
@ -214,22 +214,22 @@
|
|||
"vlnv": "xilinx.com:ip:axis_switch:1.1",
|
||||
"ports": [
|
||||
{
|
||||
"role": "initiator",
|
||||
"role": "master",
|
||||
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
|
||||
"name": "M03_AXIS"
|
||||
},
|
||||
{
|
||||
"role": "target",
|
||||
"role": "slave",
|
||||
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
|
||||
"name": "S03_AXIS"
|
||||
},
|
||||
{
|
||||
"role": "initiator",
|
||||
"role": "master",
|
||||
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
|
||||
"name": "M04_AXIS"
|
||||
},
|
||||
{
|
||||
"role": "target",
|
||||
"role": "slave",
|
||||
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
|
||||
"name": "S04_AXIS"
|
||||
}
|
||||
|
@ -237,7 +237,7 @@
|
|||
"num_ports": 7
|
||||
},
|
||||
"hier_0_hls_dft_0": {
|
||||
"vlnv": "acs.eonerc.rwth-aachen.de:hls:hls_dft:1.0",
|
||||
"vlnv": "acs.eonerc.rwth-aachen.de:hls:hls_dft:1.1",
|
||||
"ports": [
|
||||
{
|
||||
"role": "master",
|
||||
|
@ -249,7 +249,10 @@
|
|||
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:5",
|
||||
"name": "input_r"
|
||||
}
|
||||
]
|
||||
],
|
||||
"irqs": {
|
||||
"interrupt": "pcie_0_axi_pcie_intc_0:1"
|
||||
}
|
||||
},
|
||||
"hier_0_rtds_axis_0": {
|
||||
"vlnv": "acs.eonerc.rwth-aachen.de:user:rtds_axis:1.0",
|
||||
|
@ -358,6 +361,19 @@
|
|||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"axi_bars": {
|
||||
"BAR0": {
|
||||
"translation": 0,
|
||||
"baseaddr": 0,
|
||||
"highaddr": 4294967295,
|
||||
"size": 4294967296
|
||||
}
|
||||
},
|
||||
"pcie_bars": {
|
||||
"BAR0": {
|
||||
"translation": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"pcie_0_axi_pcie_intc_0": {
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
|
||||
|
@ -58,6 +59,12 @@ public:
|
|||
operator==(const Edge& other)
|
||||
{ return this->id == other.id; }
|
||||
|
||||
Vertex::Identifier getVertexTo() const
|
||||
{ return to; }
|
||||
|
||||
Vertex::Identifier getVertexFrom() const
|
||||
{ return from; }
|
||||
|
||||
private:
|
||||
Identifier id;
|
||||
Vertex::Identifier from;
|
||||
|
@ -81,7 +88,7 @@ public:
|
|||
|
||||
std::shared_ptr<VertexType> getVertex(VertexIdentifier vertexId) const
|
||||
{
|
||||
if(vertexId < 0 or vertexId >= lastVertexId)
|
||||
if(vertexId >= lastVertexId)
|
||||
throw std::invalid_argument("vertex doesn't exist");
|
||||
|
||||
// cannot use [] operator, because creates non-existing elements
|
||||
|
@ -92,7 +99,10 @@ public:
|
|||
template<class UnaryPredicate>
|
||||
VertexIdentifier findVertex(UnaryPredicate p)
|
||||
{
|
||||
for(auto& [vertexId, vertex] : vertices) {
|
||||
for(auto& v : vertices) {
|
||||
auto& vertexId = v.first;
|
||||
auto& vertex = v.second;
|
||||
|
||||
if(p(vertex)) {
|
||||
return vertexId;
|
||||
}
|
||||
|
@ -103,7 +113,7 @@ public:
|
|||
|
||||
std::shared_ptr<EdgeType> getEdge(EdgeIdentifier edgeId) const
|
||||
{
|
||||
if(edgeId < 0 or edgeId >= lastEdgeId)
|
||||
if(edgeId >= lastEdgeId)
|
||||
throw std::invalid_argument("edge doesn't exist");
|
||||
|
||||
// cannot use [] operator, because creates non-existing elements
|
||||
|
@ -177,7 +187,9 @@ public:
|
|||
// delete every edge that start or ends at this vertex
|
||||
auto it = edges.begin();
|
||||
while(it != edges.end()) {
|
||||
auto& [edgeId, edge] = *it;
|
||||
auto& edgeId = it->first;
|
||||
auto& edge = it->second;
|
||||
|
||||
bool removeEdge = false;
|
||||
|
||||
if(edge->to == vertexId) {
|
||||
|
@ -207,9 +219,17 @@ public:
|
|||
vertexGetEdges(VertexIdentifier vertexId) const
|
||||
{ return getVertex(vertexId)->edges; }
|
||||
|
||||
|
||||
using check_path_fn = std::function<bool(const Path&)>;
|
||||
|
||||
static bool
|
||||
checkPath(const Path&)
|
||||
{ return true; }
|
||||
|
||||
bool getPath(VertexIdentifier fromVertexId,
|
||||
VertexIdentifier toVertexId,
|
||||
Path& path)
|
||||
Path& path,
|
||||
check_path_fn pathCheckFunc = checkPath)
|
||||
{
|
||||
if(fromVertexId == toVertexId) {
|
||||
// arrived at the destination
|
||||
|
@ -239,7 +259,8 @@ public:
|
|||
path.push_back(edgeId);
|
||||
|
||||
// recursive, depth-first search
|
||||
if(getPath(edgeOfFromVertex->to, toVertexId, path)) {
|
||||
if(getPath(edgeOfFromVertex->to, toVertexId, path, pathCheckFunc) and
|
||||
pathCheckFunc(path)) {
|
||||
// path found, we're done
|
||||
return true;
|
||||
} else {
|
||||
|
@ -252,11 +273,11 @@ public:
|
|||
return false;
|
||||
}
|
||||
|
||||
void dump()
|
||||
void dump(const std::string& fileName = "")
|
||||
{
|
||||
logger->info("Vertices:");
|
||||
for(auto& [vertexId, vertex] : vertices) {
|
||||
(void) vertexId;
|
||||
for(auto& v : vertices) {
|
||||
auto& vertex = v.second;
|
||||
|
||||
// format connected vertices into a list
|
||||
std::stringstream ssEdges;
|
||||
|
@ -267,11 +288,29 @@ public:
|
|||
logger->info(" {} connected to: {}", *vertex, ssEdges.str());
|
||||
}
|
||||
|
||||
std::fstream s(fileName, s.out | s.trunc);
|
||||
if(s.is_open()) {
|
||||
s << "digraph memgraph {" << std::endl;
|
||||
}
|
||||
|
||||
logger->info("Edges:");
|
||||
for(auto& [edgeId, edge] : edges) {
|
||||
(void) edgeId;
|
||||
for(auto& e : edges) {
|
||||
auto& edge = e.second;
|
||||
|
||||
logger->info(" {}: {} -> {}", *edge, edge->from, edge->to);
|
||||
if(s.is_open()) {
|
||||
auto from = getVertex(edge->from);
|
||||
auto to = getVertex(edge->to);
|
||||
|
||||
s << std::dec;
|
||||
s << " \"" << *from << "\" -> \"" << *to << "\""
|
||||
<< " [label=\"" << *edge << "\"];" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(s.is_open()) {
|
||||
s << "}" << std::endl;
|
||||
s.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -98,8 +98,9 @@ public: // TODO: make this private
|
|||
|
||||
std::string name; /**< The name of the FPGA card */
|
||||
|
||||
struct pci *pci;
|
||||
struct pci* pci;
|
||||
struct pci_device filter; /**< Filter for PCI device. */
|
||||
struct pci_device* pdev; /**< PCI device handle */
|
||||
|
||||
/// The VFIO container that this card is part of
|
||||
std::shared_ptr<VfioContainer> vfioContainer;
|
||||
|
|
|
@ -193,6 +193,10 @@ protected:
|
|||
InterruptController*
|
||||
getInterruptController(const std::string& interruptName) const;
|
||||
|
||||
MemoryManager::AddressSpaceId
|
||||
getMasterAddrSpaceByInterface(const std::string& masterInterfaceName) const
|
||||
{ return busMasterInterfaces.at(masterInterfaceName); }
|
||||
|
||||
protected:
|
||||
struct IrqPort {
|
||||
int num;
|
||||
|
|
|
@ -55,7 +55,9 @@ public:
|
|||
bool readComplete()
|
||||
{ return hasScatterGather() ? readCompleteSG() : readCompleteSimple(); }
|
||||
|
||||
bool pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len);
|
||||
bool memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len);
|
||||
|
||||
bool makeAccesibleFromVA(const MemoryBlock& mem);
|
||||
|
||||
inline bool
|
||||
hasScatterGather() const
|
||||
|
@ -72,6 +74,8 @@ private:
|
|||
bool writeCompleteSimple();
|
||||
bool readCompleteSimple();
|
||||
|
||||
bool isMemoryBlockAccesible(const MemoryBlock& mem, const std::string& interface);
|
||||
|
||||
private:
|
||||
static constexpr char registerMemory[] = "Reg";
|
||||
|
||||
|
|
|
@ -52,6 +52,19 @@ public:
|
|||
private:
|
||||
static constexpr char axiInterface[] = "M_AXI";
|
||||
static constexpr char pcieMemory[] = "BAR0";
|
||||
|
||||
struct AxiBar {
|
||||
uintptr_t base;
|
||||
size_t size;
|
||||
uintptr_t translation;
|
||||
};
|
||||
|
||||
struct PciBar {
|
||||
uintptr_t translation;
|
||||
};
|
||||
|
||||
std::map<std::string, AxiBar> axiToPcieTranslations;
|
||||
std::map<std::string, PciBar> pcieToAxiTranslations;
|
||||
};
|
||||
|
||||
|
||||
|
@ -64,6 +77,8 @@ public:
|
|||
getCompatibleVlnvString()
|
||||
{ return "xilinx.com:ip:axi_pcie:"; }
|
||||
|
||||
bool configureJson(IpCore& ip, json_t *json_ip);
|
||||
|
||||
IpCore* create()
|
||||
{ return new AxiPciExpressBridge; }
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include "list.h"
|
||||
|
||||
#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f)
|
||||
|
@ -33,6 +34,13 @@ struct pci_device {
|
|||
} slot; /**< Bus, Device, Function (BDF) */
|
||||
};
|
||||
|
||||
struct pci_region {
|
||||
int num;
|
||||
uintptr_t start;
|
||||
uintptr_t end;
|
||||
unsigned long long flags;
|
||||
};
|
||||
|
||||
struct pci {
|
||||
struct list devices; /**< List of available PCI devices in the system (struct pci_device) */
|
||||
};
|
||||
|
@ -66,6 +74,8 @@ int pci_attach_driver(const struct pci_device *d, const char *driver);
|
|||
/** Return the IOMMU group of this PCI device or -1 if the device is not in a group. */
|
||||
int pci_get_iommu_group(const struct pci_device *d);
|
||||
|
||||
size_t pci_get_regions(const struct pci_device *d, struct pci_region** regions);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -18,7 +18,8 @@
|
|||
#include <linux/vfio.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#define VFIO_DEV(x) "/dev/vfio/" x
|
||||
#define VFIO_PATH "/dev/vfio/"
|
||||
#define VFIO_DEV VFIO_PATH "vfio"
|
||||
|
||||
/* Forward declarations */
|
||||
struct pci_device;
|
||||
|
@ -90,7 +91,7 @@ public:
|
|||
~VfioGroup();
|
||||
|
||||
static std::unique_ptr<VfioGroup>
|
||||
attach(int containerFd, int groupIndex);
|
||||
attach(VfioContainer& container, int groupIndex);
|
||||
|
||||
private:
|
||||
/// VFIO group file descriptor
|
||||
|
@ -135,6 +136,12 @@ public:
|
|||
/** munmap() a region which has been mapped by vfio_map_region() */
|
||||
bool memoryUnmap(uintptr_t phys, size_t length);
|
||||
|
||||
bool isIommuEnabled() const
|
||||
{ return this->hasIommu; }
|
||||
|
||||
const int& getFd() const
|
||||
{ return fd; }
|
||||
|
||||
private:
|
||||
VfioGroup& getOrAttachGroup(int index);
|
||||
|
||||
|
@ -143,6 +150,7 @@ private:
|
|||
int version;
|
||||
int extensions;
|
||||
uint64_t iova_next; /**< Next free IOVA address */
|
||||
bool hasIommu;
|
||||
|
||||
/// All groups bound to this container
|
||||
std::list<std::unique_ptr<VfioGroup>> groups;
|
||||
|
|
|
@ -116,6 +116,8 @@ public:
|
|||
free = [&](MemoryBlock* mem) {
|
||||
logger->warn("no free callback defined for addr space {}, not freeing",
|
||||
mem->getAddrSpaceId());
|
||||
|
||||
removeMemoryBlock(*mem);
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -128,6 +130,21 @@ public:
|
|||
{
|
||||
const size_t size = num * sizeof(T);
|
||||
auto mem = allocateBlock(size);
|
||||
|
||||
// Check if the allocated memory is really accessible by writing to the
|
||||
// allocated memory and reading back. Exponentially increase offset to
|
||||
// speed up testing.
|
||||
MemoryAccessor<volatile uint8_t> byteAccessor(*mem);
|
||||
size_t idx = 0;
|
||||
for(int i = 0; idx < mem->getSize(); i++, idx = (1 << i)) {
|
||||
auto val = static_cast<uint8_t>(i);
|
||||
byteAccessor[idx] = val;
|
||||
if(byteAccessor[idx] != val) {
|
||||
logger->error("Cannot access allocated memory");
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
}
|
||||
|
||||
return MemoryAccessor<T>(std::move(mem));
|
||||
}
|
||||
|
||||
|
@ -179,6 +196,9 @@ public:
|
|||
size_t getAvailableMemory() const
|
||||
{ return memorySize - nextFreeAddress; }
|
||||
|
||||
size_t getSize() const
|
||||
{ return memorySize; }
|
||||
|
||||
std::string getName() const;
|
||||
|
||||
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
|
||||
|
@ -225,4 +245,41 @@ private:
|
|||
static HostRamAllocator allocator;
|
||||
};
|
||||
|
||||
|
||||
class HostDmaRam {
|
||||
private:
|
||||
|
||||
static std::string
|
||||
getUdmaBufName(int num);
|
||||
|
||||
static std::string
|
||||
getUdmaBufBasePath(int num);
|
||||
|
||||
static size_t
|
||||
getUdmaBufBufSize(int num);
|
||||
|
||||
static uintptr_t
|
||||
getUdmaBufPhysAddr(int num);
|
||||
|
||||
public:
|
||||
class HostDmaRamAllocator : public LinearAllocator {
|
||||
public:
|
||||
HostDmaRamAllocator(int num);
|
||||
|
||||
virtual ~HostDmaRamAllocator();
|
||||
|
||||
std::string getName() const
|
||||
{ return getUdmaBufName(num); }
|
||||
|
||||
private:
|
||||
int num;
|
||||
};
|
||||
|
||||
static HostDmaRamAllocator&
|
||||
getAllocator(int num = 0);
|
||||
|
||||
private:
|
||||
static std::map<int, std::unique_ptr<HostDmaRamAllocator>> allocators;
|
||||
};
|
||||
|
||||
} // namespace villas
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "log.hpp"
|
||||
|
@ -73,7 +74,12 @@ private:
|
|||
// This is a singleton, so private constructor ...
|
||||
MemoryManager() :
|
||||
memoryGraph("MemoryGraph"),
|
||||
logger(loggerGetOrCreate("MemoryManager")) {}
|
||||
logger(loggerGetOrCreate("MemoryManager"))
|
||||
{
|
||||
pathCheckFunc = [&](const MemoryGraph::Path& path) {
|
||||
return this->pathCheck(path);
|
||||
};
|
||||
}
|
||||
|
||||
// ... and no copying or assigning
|
||||
MemoryManager(const MemoryManager&) = delete;
|
||||
|
@ -109,7 +115,7 @@ private:
|
|||
return stream << static_cast<const Edge&>(mapping) << " = "
|
||||
<< mapping.name
|
||||
<< std::hex
|
||||
<< "(src=0x" << mapping.src
|
||||
<< " (src=0x" << mapping.src
|
||||
<< ", dest=0x" << mapping.dest
|
||||
<< ", size=0x" << mapping.size
|
||||
<< ")";
|
||||
|
@ -144,6 +150,8 @@ public:
|
|||
using AddressSpaceId = MemoryGraph::VertexIdentifier;
|
||||
using MappingId = MemoryGraph::EdgeIdentifier;
|
||||
|
||||
struct InvalidTranslation : public std::exception {};
|
||||
|
||||
/// Get singleton instance
|
||||
static MemoryManager&
|
||||
get();
|
||||
|
@ -152,6 +160,10 @@ public:
|
|||
getProcessAddressSpace()
|
||||
{ return getOrCreateAddressSpace("villas-fpga"); }
|
||||
|
||||
AddressSpaceId
|
||||
getPciAddressSpace()
|
||||
{ return getOrCreateAddressSpace("PCIe"); }
|
||||
|
||||
AddressSpaceId
|
||||
getProcessAddressSpaceMemoryBlock(const std::string& memoryBlock)
|
||||
{ return getOrCreateAddressSpace(getSlaveAddrSpaceName("villas-fpga", memoryBlock)); }
|
||||
|
@ -184,6 +196,9 @@ public:
|
|||
AddressSpaceId
|
||||
findAddressSpace(const std::string& name);
|
||||
|
||||
std::list<AddressSpaceId>
|
||||
findPath(AddressSpaceId fromAddrSpaceId, AddressSpaceId toAddrSpaceId);
|
||||
|
||||
MemoryTranslation
|
||||
getTranslation(AddressSpaceId fromAddrSpaceId, AddressSpaceId toAddrSpaceId);
|
||||
|
||||
|
@ -203,6 +218,9 @@ public:
|
|||
dump()
|
||||
{ memoryGraph.dump(); }
|
||||
|
||||
void
|
||||
dumpToFile(const std::string& fileName)
|
||||
{ memoryGraph.dump(fileName); }
|
||||
|
||||
private:
|
||||
/// Convert a Mapping to MemoryTranslation for calculations
|
||||
|
@ -210,6 +228,8 @@ private:
|
|||
getTranslationFromMapping(const Mapping& mapping)
|
||||
{ return MemoryTranslation(mapping.src, mapping.dest, mapping.size); }
|
||||
|
||||
bool
|
||||
pathCheck(const MemoryGraph::Path& path);
|
||||
|
||||
private:
|
||||
/// Directed graph that stores address spaces and memory mappings
|
||||
|
@ -221,6 +241,8 @@ private:
|
|||
/// Logger for universal access in this class
|
||||
SpdLogger logger;
|
||||
|
||||
MemoryGraph::check_path_fn pathCheckFunc;
|
||||
|
||||
/// Static pointer to global instance, because this is a singleton
|
||||
static MemoryManager* instance;
|
||||
};
|
||||
|
|
|
@ -40,6 +40,7 @@ public:
|
|||
Unknown,
|
||||
FpgaIp,
|
||||
FpgaCard,
|
||||
Gpu
|
||||
};
|
||||
|
||||
Plugin(Type type, const std::string& name);
|
||||
|
|
|
@ -10,6 +10,15 @@ namespace utils {
|
|||
std::vector<std::string>
|
||||
tokenize(std::string s, std::string delimiter);
|
||||
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
assertExcept(bool condition, const T& exception)
|
||||
{
|
||||
if(not condition)
|
||||
throw exception;
|
||||
}
|
||||
|
||||
} // namespace utils
|
||||
} // namespace villas
|
||||
|
||||
|
|
|
@ -36,6 +36,16 @@ add_library(villas-fpga SHARED ${SOURCES})
|
|||
|
||||
target_link_libraries(villas-fpga PUBLIC villas-common)
|
||||
|
||||
# GPU library is optional, check for CUDA presence
|
||||
include(CheckLanguage)
|
||||
check_language(CUDA)
|
||||
if(CMAKE_CUDA_COMPILER)
|
||||
add_subdirectory(gpu)
|
||||
target_link_libraries(villas-fpga PUBLIC villas-gpu)
|
||||
else()
|
||||
message("No CUDA support, not building GPU library")
|
||||
endif()
|
||||
|
||||
target_compile_definitions(villas-fpga PRIVATE
|
||||
BUILDID=\"abc\"
|
||||
_GNU_SOURCE
|
||||
|
|
|
@ -165,6 +165,11 @@ PCIeCard::lookupIp(const Vlnv& vlnv) const
|
|||
bool
|
||||
PCIeCard::mapMemoryBlock(const MemoryBlock& block)
|
||||
{
|
||||
if(not vfioContainer->isIommuEnabled()) {
|
||||
logger->warn("VFIO mapping not supported without IOMMU");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto& mm = MemoryManager::get();
|
||||
const auto& addrSpaceId = block.getAddrSpaceId();
|
||||
|
||||
|
@ -175,7 +180,6 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
|
|||
logger->debug("Create VFIO mapping for {}", addrSpaceId);
|
||||
}
|
||||
|
||||
|
||||
auto translationFromProcess = mm.getTranslationFromProcess(addrSpaceId);
|
||||
uintptr_t processBaseAddr = translationFromProcess.getLocalAddr(0);
|
||||
uintptr_t iovaAddr = vfioContainer->memoryMap(processBaseAddr,
|
||||
|
@ -188,10 +192,8 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
|
|||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
mm.createMapping(iovaAddr, 0, block.getSize(),
|
||||
"vfio",
|
||||
"VFIO-D2H",
|
||||
this->addrSpaceIdDeviceToHost,
|
||||
addrSpaceId);
|
||||
|
||||
|
@ -203,11 +205,8 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
|
|||
|
||||
|
||||
bool
|
||||
fpga::PCIeCard::init()
|
||||
PCIeCard::init()
|
||||
{
|
||||
struct pci_device *pdev;
|
||||
|
||||
auto& mm = MemoryManager::get();
|
||||
logger = getLogger();
|
||||
|
||||
logger->info("Initializing FPGA card {}", name);
|
||||
|
@ -223,41 +222,12 @@ fpga::PCIeCard::init()
|
|||
VfioDevice& device = vfioContainer->attachDevice(pdev);
|
||||
this->vfioDevice = &device;
|
||||
|
||||
|
||||
/* Enable memory access and PCI bus mastering for DMA */
|
||||
if (not device.pciEnable()) {
|
||||
logger->error("Failed to enable PCI device");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Map PCIe BAR */
|
||||
const void* bar0_mapped = vfioDevice->regionMap(VFIO_PCI_BAR0_REGION_INDEX);
|
||||
if (bar0_mapped == MAP_FAILED) {
|
||||
logger->error("Failed to mmap() BAR0");
|
||||
return false;
|
||||
}
|
||||
|
||||
// determine size of BAR0 region
|
||||
const size_t bar0_size = vfioDevice->regionGetSize(VFIO_PCI_BAR0_REGION_INDEX);
|
||||
|
||||
|
||||
/* Link mapped BAR0 to global memory graph */
|
||||
|
||||
// get the address space of the current application
|
||||
const auto villasAddrSpace = mm.getProcessAddressSpace();
|
||||
|
||||
// get the address space for the PCIe proxy we use with VFIO
|
||||
const auto cardPCIeAddrSpaceName = mm.getMasterAddrSpaceName(name, "PCIe");
|
||||
|
||||
// create a new address space for this FPGA card
|
||||
addrSpaceIdHostToDevice = mm.getOrCreateAddressSpace(cardPCIeAddrSpaceName);
|
||||
|
||||
// create a mapping from our address space to the FPGA card via vfio
|
||||
mm.createMapping(reinterpret_cast<uintptr_t>(bar0_mapped),
|
||||
0, bar0_size, "VFIO_map",
|
||||
villasAddrSpace, addrSpaceIdHostToDevice);
|
||||
|
||||
|
||||
/* Reset system? */
|
||||
if (do_reset) {
|
||||
/* Reset / detect PCI device */
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
#include "memory.hpp"
|
||||
|
||||
namespace villas {
|
||||
|
@ -62,6 +65,8 @@ LinearAllocator::LinearAllocator(MemoryManager::AddressSpaceId memoryAddrSpaceId
|
|||
mem->getSize(), mem->getOffset(), mem->getAddrSpaceId());
|
||||
logger->warn("free() not implemented");
|
||||
logger->debug("available memory: {:#x} bytes", getAvailableMemory());
|
||||
|
||||
removeMemoryBlock(*mem);
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -133,7 +138,127 @@ HostRam::HostRamAllocator::HostRamAllocator() :
|
|||
logger->warn("munmap() failed for {:#x} of size {:#x}",
|
||||
mem->getOffset(), mem->getSize());
|
||||
}
|
||||
|
||||
removeMemoryBlock(*mem);
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
std::map<int, std::unique_ptr<HostDmaRam::HostDmaRamAllocator>>
|
||||
HostDmaRam::allocators;
|
||||
|
||||
HostDmaRam::HostDmaRamAllocator::HostDmaRamAllocator(int num) :
|
||||
LinearAllocator(MemoryManager::get().getOrCreateAddressSpace(getUdmaBufName(num)), getUdmaBufBufSize(num)),
|
||||
num(num)
|
||||
{
|
||||
auto& mm = MemoryManager::get();
|
||||
logger = loggerGetOrCreate(getName());
|
||||
|
||||
if(getSize() == 0) {
|
||||
logger->error("Zero-sized DMA buffer not supported, is the kernel module loaded?");
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
|
||||
const uintptr_t base = getUdmaBufPhysAddr(num);
|
||||
|
||||
mm.createMapping(base, 0, getSize(), getName() + "-PCI",
|
||||
mm.getPciAddressSpace(), getAddrSpaceId());
|
||||
|
||||
const auto bufPath = std::string("/dev/") + getUdmaBufName(num);
|
||||
const int bufFd = open(bufPath.c_str(), O_RDWR | O_SYNC);
|
||||
if(bufFd != -1) {
|
||||
void* buf = mmap(nullptr, getSize(), PROT_READ|PROT_WRITE, MAP_SHARED, bufFd, 0);
|
||||
close(bufFd);
|
||||
|
||||
if(buf != MAP_FAILED) {
|
||||
mm.createMapping(reinterpret_cast<uintptr_t>(buf), 0, getSize(),
|
||||
getName() + "-VA",
|
||||
mm.getProcessAddressSpace(), getAddrSpaceId());
|
||||
} else {
|
||||
logger->warn("Cannot map {}", bufPath);
|
||||
}
|
||||
} else {
|
||||
logger->warn("Cannot open {}", bufPath);
|
||||
}
|
||||
|
||||
logger->info("Mapped {} of size {} bytes", bufPath, getSize());
|
||||
}
|
||||
|
||||
HostDmaRam::HostDmaRamAllocator::~HostDmaRamAllocator()
|
||||
{
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
void* baseVirt;
|
||||
try {
|
||||
auto translation = mm.getTranslationFromProcess(getAddrSpaceId());
|
||||
baseVirt = reinterpret_cast<void*>(translation.getLocalAddr(0));
|
||||
} catch(const std::out_of_range&) {
|
||||
// not mapped, nothing to do
|
||||
return;
|
||||
}
|
||||
|
||||
logger->debug("Unmapping {}", getName());
|
||||
|
||||
// try to unmap it
|
||||
if(::munmap(baseVirt, getSize()) != 0) {
|
||||
logger->warn("munmap() failed for {:p} of size {:#x}",
|
||||
baseVirt, getSize());
|
||||
}
|
||||
}
|
||||
|
||||
std::string
|
||||
HostDmaRam::getUdmaBufName(int num)
|
||||
{
|
||||
std::stringstream name;
|
||||
name << "udmabuf" << num;
|
||||
|
||||
return name.str();
|
||||
}
|
||||
|
||||
std::string
|
||||
HostDmaRam::getUdmaBufBasePath(int num)
|
||||
{
|
||||
std::stringstream path;
|
||||
path << "/sys/class/udmabuf/udmabuf" << num << "/";
|
||||
return path.str();
|
||||
}
|
||||
|
||||
size_t
|
||||
HostDmaRam::getUdmaBufBufSize(int num)
|
||||
{
|
||||
std::fstream s(getUdmaBufBasePath(num) + "size", s.in);
|
||||
if(s.is_open()) {
|
||||
std::string line;
|
||||
if(std::getline(s, line)) {
|
||||
return std::strtoul(line.c_str(), nullptr, 10);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
uintptr_t
|
||||
HostDmaRam::getUdmaBufPhysAddr(int num)
|
||||
{
|
||||
std::fstream s(getUdmaBufBasePath(num) + "phys_addr", s.in);
|
||||
if(s.is_open()) {
|
||||
std::string line;
|
||||
if(std::getline(s, line)) {
|
||||
return std::strtoul(line.c_str(), nullptr, 16);
|
||||
}
|
||||
}
|
||||
|
||||
return UINTPTR_MAX;
|
||||
}
|
||||
|
||||
HostDmaRam::HostDmaRamAllocator&HostDmaRam::getAllocator(int num)
|
||||
{
|
||||
auto& allocator = allocators[num];
|
||||
if(not allocator) {
|
||||
allocator = std::make_unique<HostDmaRamAllocator>(num);
|
||||
}
|
||||
|
||||
return *allocator;
|
||||
}
|
||||
|
||||
} // namespace villas
|
||||
|
|
|
@ -2,8 +2,11 @@
|
|||
#include <limits>
|
||||
#include <cstdint>
|
||||
|
||||
#include <villas/utils.hpp>
|
||||
#include "memory_manager.hpp"
|
||||
|
||||
using namespace villas::utils;
|
||||
|
||||
namespace villas {
|
||||
|
||||
MemoryManager*
|
||||
|
@ -70,17 +73,44 @@ MemoryManager::findAddressSpace(const std::string& name)
|
|||
});
|
||||
}
|
||||
|
||||
std::list<MemoryManager::AddressSpaceId>
|
||||
MemoryManager::findPath(MemoryManager::AddressSpaceId fromAddrSpaceId,
|
||||
MemoryManager::AddressSpaceId toAddrSpaceId)
|
||||
{
|
||||
std::list<AddressSpaceId> path;
|
||||
|
||||
auto fromAddrSpace = memoryGraph.getVertex(fromAddrSpaceId);
|
||||
auto toAddrSpace = memoryGraph.getVertex(toAddrSpaceId);
|
||||
|
||||
// find a path through the memory graph
|
||||
MemoryGraph::Path pathGraph;
|
||||
if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, pathGraph, pathCheckFunc)) {
|
||||
|
||||
logger->debug("No translation found from ({}) to ({})",
|
||||
*fromAddrSpace, *toAddrSpace);
|
||||
|
||||
throw std::out_of_range("no translation found");
|
||||
}
|
||||
|
||||
for(auto& mappingId : pathGraph) {
|
||||
auto mapping = memoryGraph.getEdge(mappingId);
|
||||
path.push_back(mapping->getVertexTo());
|
||||
}
|
||||
|
||||
return path;
|
||||
}
|
||||
|
||||
MemoryTranslation
|
||||
MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
|
||||
MemoryManager::AddressSpaceId toAddrSpaceId)
|
||||
{
|
||||
// find a path through the memory graph
|
||||
MemoryGraph::Path path;
|
||||
if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, path)) {
|
||||
if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, path, pathCheckFunc)) {
|
||||
|
||||
auto fromAddrSpace = memoryGraph.getVertex(fromAddrSpaceId);
|
||||
auto toAddrSpace = memoryGraph.getVertex(toAddrSpaceId);
|
||||
|
||||
logger->error("No translation found from ({}) to ({})",
|
||||
logger->debug("No translation found from ({}) to ({})",
|
||||
*fromAddrSpace, *toAddrSpace);
|
||||
|
||||
throw std::out_of_range("no translation found");
|
||||
|
@ -98,6 +128,26 @@ MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
|
|||
return translation;
|
||||
}
|
||||
|
||||
bool
|
||||
MemoryManager::pathCheck(const MemoryGraph::Path& path)
|
||||
{
|
||||
// start with an identity mapping
|
||||
MemoryTranslation translation(0, 0, SIZE_MAX);
|
||||
|
||||
// Try to add all mappings together to a common translation. If this fails
|
||||
// there is a non-overlapping window
|
||||
for(auto& mappingId : path) {
|
||||
auto mapping = memoryGraph.getEdge(mappingId);
|
||||
try {
|
||||
translation += getTranslationFromMapping(*mapping);
|
||||
} catch(const InvalidTranslation&) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
uintptr_t
|
||||
MemoryTranslation::getLocalAddr(uintptr_t addrInForeignAddrSpace) const
|
||||
{
|
||||
|
@ -124,9 +174,18 @@ MemoryTranslation::operator+=(const MemoryTranslation& other)
|
|||
const uintptr_t this_dst_high = this->dst + this->size;
|
||||
const uintptr_t other_src_high = other.src + other.size;
|
||||
|
||||
logger->debug("this->src: {:#x}", this->src);
|
||||
logger->debug("this->dst: {:#x}", this->dst);
|
||||
logger->debug("this->size: {:#x}", this->size);
|
||||
logger->debug("other.src: {:#x}", other.src);
|
||||
logger->debug("other.dst: {:#x}", other.dst);
|
||||
logger->debug("other.size: {:#x}", other.size);
|
||||
logger->debug("this_dst_high: {:#x}", this_dst_high);
|
||||
logger->debug("other_src_high: {:#x}", other_src_high);
|
||||
|
||||
// make sure there is a common memory area
|
||||
assert(other.src < this_dst_high);
|
||||
assert(this->dst < other_src_high);
|
||||
assertExcept(other.src < this_dst_high, MemoryManager::InvalidTranslation());
|
||||
assertExcept(this->dst < other_src_high, MemoryManager::InvalidTranslation());
|
||||
|
||||
const uintptr_t hi = std::max(this_dst_high, other_src_high);
|
||||
const uintptr_t lo = std::min(this->dst, other.src);
|
||||
|
@ -135,33 +194,39 @@ MemoryTranslation::operator+=(const MemoryTranslation& other)
|
|||
? (this_dst_high - other_src_high)
|
||||
: (other_src_high - this_dst_high);
|
||||
|
||||
const uintptr_t diff_lo = (this->dst > other.src)
|
||||
const bool otherSrcIsSmaller = this->dst > other.src;
|
||||
const uintptr_t diff_lo = (otherSrcIsSmaller)
|
||||
? (this->dst - other.src)
|
||||
: (other.src - this->dst);
|
||||
|
||||
const size_t size = (hi - lo) - diff_hi - diff_lo;
|
||||
logger->debug("hi: {:#x}", hi);
|
||||
logger->debug("lo: {:#x}", lo);
|
||||
logger->debug("diff_hi: {:#x}", diff_hi);
|
||||
logger->debug("diff_lo: {:#x}", diff_lo);
|
||||
|
||||
logger->debug("this->src: 0x{:x}", this->src);
|
||||
logger->debug("this->dst: 0x{:x}", this->dst);
|
||||
logger->debug("this->size: 0x{:x}", this->size);
|
||||
logger->debug("other.src: 0x{:x}", other.src);
|
||||
logger->debug("other.dst: 0x{:x}", other.dst);
|
||||
logger->debug("other.size: 0x{:x}", other.size);
|
||||
logger->debug("this_dst_high: 0x{:x}", this_dst_high);
|
||||
logger->debug("other_src_high: 0x{:x}", other_src_high);
|
||||
logger->debug("hi: 0x{:x}", hi);
|
||||
logger->debug("lo: 0x{:x}", lo);
|
||||
logger->debug("diff_hi: 0x{:x}", diff_hi);
|
||||
logger->debug("diff_hi: 0x{:x}", diff_lo);
|
||||
logger->debug("size: 0x{:x}", size);
|
||||
// new size of aperture, can only stay or shrink
|
||||
this->size = (hi - lo) - diff_hi - diff_lo;
|
||||
|
||||
this->src += other.src;
|
||||
this->dst += other.dst;
|
||||
this->size = size;
|
||||
// new translation will come out other's destination (by default)
|
||||
this->dst = other.dst;
|
||||
|
||||
logger->debug("result src: 0x{:x}", this->src);
|
||||
logger->debug("result dst: 0x{:x}", this->dst);
|
||||
logger->debug("result size: 0x{:x}", this->size);
|
||||
// the source stays the same and can only increase with merged translations
|
||||
this->src = this->src;
|
||||
|
||||
if(otherSrcIsSmaller) {
|
||||
// other mapping starts at lower addresses, so we actually arrive at
|
||||
// higher addresses
|
||||
this->dst += diff_lo;
|
||||
} else {
|
||||
// other mapping starts at higher addresses than this, so we have to
|
||||
// increase the start
|
||||
// NOTE: for addresses equality, this just adds 0
|
||||
this->src += diff_lo;
|
||||
}
|
||||
|
||||
logger->debug("result src: {:#x}", this->src);
|
||||
logger->debug("result dst: {:#x}", this->dst);
|
||||
logger->debug("result size: {:#x}", this->size);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
|
26
fpga/lib/gpu/CMakeLists.txt
Normal file
26
fpga/lib/gpu/CMakeLists.txt
Normal file
|
@ -0,0 +1,26 @@
|
|||
cmake_minimum_required(VERSION 3.8)
|
||||
|
||||
project(villas-gpu
|
||||
VERSION 1.0
|
||||
DESCRIPTION "VILLASgpu"
|
||||
LANGUAGES C CXX CUDA)
|
||||
|
||||
add_subdirectory(gdrcopy)
|
||||
|
||||
add_library(villas-gpu SHARED gpu.cpp kernels.cu)
|
||||
|
||||
target_compile_options(villas-gpu PRIVATE -g)
|
||||
|
||||
set_source_files_properties(gpu.cpp PROPERTIES
|
||||
LANGUAGE CUDA)
|
||||
|
||||
target_include_directories(villas-gpu PRIVATE /opt/cuda/include)
|
||||
|
||||
target_link_libraries(villas-gpu
|
||||
PRIVATE villas-common gdrapi cuda)
|
||||
|
||||
target_include_directories(villas-gpu
|
||||
PUBLIC
|
||||
${CMAKE_CURRENT_LIST_DIR}/include
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_LIST_DIR})
|
1
fpga/lib/gpu/gdrcopy
Submodule
1
fpga/lib/gpu/gdrcopy
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit 0441daa447b80260c4e11096f03e88f7be08bfa2
|
474
fpga/lib/gpu/gpu.cpp
Normal file
474
fpga/lib/gpu/gpu.cpp
Normal file
|
@ -0,0 +1,474 @@
|
|||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#include <memory>
|
||||
#include <algorithm>
|
||||
|
||||
#include <villas/gpu.hpp>
|
||||
#include <villas/log.hpp>
|
||||
#include <villas/kernel/pci.h>
|
||||
#include <memory_manager.hpp>
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <gdrapi.h>
|
||||
|
||||
#include "kernels.hpp"
|
||||
|
||||
namespace villas {
|
||||
namespace gpu {
|
||||
|
||||
static GpuFactory gpuFactory;
|
||||
|
||||
GpuAllocator::GpuAllocator(Gpu& gpu) :
|
||||
BaseAllocator(gpu.masterPciEAddrSpaceId),
|
||||
gpu(gpu)
|
||||
{
|
||||
free = [&](MemoryBlock* mem) {
|
||||
cudaSetDevice(gpu.gpuId);
|
||||
if(cudaFree(reinterpret_cast<void*>(mem->getOffset())) != cudaSuccess) {
|
||||
logger->warn("cudaFree() failed for {:#x} of size {:#x}",
|
||||
mem->getOffset(), mem->getSize());
|
||||
}
|
||||
|
||||
removeMemoryBlock(*mem);
|
||||
};
|
||||
}
|
||||
|
||||
std::string
|
||||
villas::gpu::GpuAllocator::getName() const
|
||||
{
|
||||
std::stringstream name;
|
||||
name << "GpuAlloc" << getAddrSpaceId();
|
||||
return name.str();
|
||||
}
|
||||
|
||||
|
||||
GpuFactory::GpuFactory() :
|
||||
Plugin(Plugin::Type::Gpu, "GPU")
|
||||
{
|
||||
logger = loggerGetOrCreate("GpuFactory");
|
||||
}
|
||||
|
||||
// required to be defined here for PIMPL to compile
|
||||
Gpu::~Gpu()
|
||||
{
|
||||
auto& mm = MemoryManager::get();
|
||||
mm.removeAddressSpace(masterPciEAddrSpaceId);
|
||||
}
|
||||
|
||||
|
||||
// we use PIMPL in order to hide gdrcopy types from the public header
|
||||
class Gpu::impl {
|
||||
public:
|
||||
gdr_t gdr;
|
||||
struct pci_device pdev;
|
||||
};
|
||||
|
||||
std::string Gpu::getName() const
|
||||
{
|
||||
cudaDeviceProp deviceProp;
|
||||
if(cudaGetDeviceProperties(&deviceProp, gpuId) != cudaSuccess) {
|
||||
// logger not yet availabe
|
||||
loggerGetOrCreate("Gpu")->error("Cannot retrieve properties for GPU {}", gpuId);
|
||||
throw std::exception();
|
||||
}
|
||||
|
||||
std::stringstream name;
|
||||
name << "gpu" << gpuId << "(" << deviceProp.name << ")";
|
||||
|
||||
return name.str();
|
||||
}
|
||||
|
||||
bool Gpu::registerIoMemory(const MemoryBlock& mem)
|
||||
{
|
||||
auto& mm = MemoryManager::get();
|
||||
const auto pciAddrSpaceId = mm.getPciAddressSpace();
|
||||
|
||||
// Check if we need to map anything at all, maybe it's already reachable
|
||||
try {
|
||||
// TODO: there might already be a path through the graph, but there's no
|
||||
// overlapping window, so this will fail badly!
|
||||
auto translation = mm.getTranslation(masterPciEAddrSpaceId,
|
||||
mem.getAddrSpaceId());
|
||||
if(translation.getSize() >= mem.getSize()) {
|
||||
// there is already a sufficient path
|
||||
logger->debug("Already mapped through another mapping");
|
||||
return true;
|
||||
} else {
|
||||
logger->warn("There's already a mapping, but too small");
|
||||
}
|
||||
} catch(const std::out_of_range&) {
|
||||
// not yet reachable, that's okay, proceed
|
||||
}
|
||||
|
||||
|
||||
// In order to register IO memory with CUDA, it has to be mapped to the VA
|
||||
// space of the current process (requirement of CUDA API). Check this now.
|
||||
MemoryManager::AddressSpaceId mappedBaseAddrSpaceId;
|
||||
try {
|
||||
auto path = mm.findPath(mm.getProcessAddressSpace(), mem.getAddrSpaceId());
|
||||
// first node in path is the mapped memory space whose virtual address
|
||||
// we need to hand to CUDA
|
||||
mappedBaseAddrSpaceId = path.front();
|
||||
} catch (const std::out_of_range&) {
|
||||
logger->error("Memory not reachable from process, but required by CUDA");
|
||||
return false;
|
||||
}
|
||||
|
||||
// determine the base address of the mapped memory region needed by CUDA
|
||||
const auto translationProcess = mm.getTranslationFromProcess(mappedBaseAddrSpaceId);
|
||||
const uintptr_t baseAddrForProcess = translationProcess.getLocalAddr(0);
|
||||
|
||||
|
||||
// Now check that the memory is also reachable via PCIe bus, otherwise GPU
|
||||
// has no means to access it.
|
||||
uintptr_t baseAddrOnPci;
|
||||
size_t sizeOnPci;
|
||||
try {
|
||||
auto translationPci = mm.getTranslation(pciAddrSpaceId,
|
||||
mappedBaseAddrSpaceId);
|
||||
baseAddrOnPci = translationPci.getLocalAddr(0);
|
||||
sizeOnPci = translationPci.getSize();
|
||||
} catch(const std::out_of_range&) {
|
||||
logger->error("Memory is not reachable via PCIe bus");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(sizeOnPci < mem.getSize()) {
|
||||
logger->warn("VA mapping of IO memory is too small: {:#x} instead of {:#x} bytes",
|
||||
sizeOnPci, mem.getSize());
|
||||
logger->warn("If something later on fails or behaves strangely, this might be the cause!");
|
||||
}
|
||||
|
||||
|
||||
cudaSetDevice(gpuId);
|
||||
|
||||
auto baseAddrVA = reinterpret_cast<void*>(baseAddrForProcess);
|
||||
if(cudaHostRegister(baseAddrVA, sizeOnPci, cudaHostRegisterIoMemory) != cudaSuccess) {
|
||||
logger->error("Cannot register IO memory for block {}", mem.getAddrSpaceId());
|
||||
return false;
|
||||
}
|
||||
|
||||
void* devicePointer = nullptr;
|
||||
if(cudaHostGetDevicePointer(&devicePointer, baseAddrVA, 0) != cudaSuccess) {
|
||||
logger->error("Cannot retrieve device pointer for IO memory");
|
||||
return false;
|
||||
}
|
||||
|
||||
mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), baseAddrOnPci,
|
||||
sizeOnPci, "CudaIoMem", masterPciEAddrSpaceId, pciAddrSpaceId);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
Gpu::registerHostMemory(const MemoryBlock& mem)
|
||||
{
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
auto translation = mm.getTranslationFromProcess(mem.getAddrSpaceId());
|
||||
auto localBase = reinterpret_cast<void*>(translation.getLocalAddr(0));
|
||||
|
||||
int ret = cudaHostRegister(localBase, mem.getSize(), 0);
|
||||
if(ret != cudaSuccess) {
|
||||
logger->error("Cannot register memory block {} addr={:p} size={:#x} to CUDA: ret={}",
|
||||
mem.getAddrSpaceId(), localBase, mem.getSize(), ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
void* devicePointer = nullptr;
|
||||
ret = cudaHostGetDevicePointer(&devicePointer, localBase, 0);
|
||||
if(ret != cudaSuccess) {
|
||||
logger->error("Cannot retrieve device pointer for IO memory: ret={}", ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), 0, mem.getSize(),
|
||||
"CudaHostMem", masterPciEAddrSpaceId, mem.getAddrSpaceId());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Gpu::makeAccessibleToPCIeAndVA(const MemoryBlock& mem)
|
||||
{
|
||||
if(pImpl->gdr == nullptr) {
|
||||
logger->warn("GDRcopy not available");
|
||||
return false;
|
||||
}
|
||||
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
try {
|
||||
auto path = mm.findPath(masterPciEAddrSpaceId, mem.getAddrSpaceId());
|
||||
// if first hop is the PCIe bus, we know that memory is off-GPU
|
||||
if(path.front() == mm.getPciAddressSpace()) {
|
||||
throw std::out_of_range("Memory block is outside of this GPU");
|
||||
}
|
||||
|
||||
} catch (const std::out_of_range&) {
|
||||
logger->error("Trying to map non-GPU memory block");
|
||||
return false;
|
||||
}
|
||||
|
||||
logger->debug("retrieve complete device pointer from point of view of GPU");
|
||||
// retrieve complete device pointer from point of view of GPU
|
||||
auto translation = mm.getTranslation(masterPciEAddrSpaceId,
|
||||
mem.getAddrSpaceId());
|
||||
CUdeviceptr devptr = translation.getLocalAddr(0);
|
||||
|
||||
int ret;
|
||||
|
||||
// required to set this flag before mapping
|
||||
unsigned int enable = 1;
|
||||
ret = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, devptr);
|
||||
if(ret != CUDA_SUCCESS) {
|
||||
logger->error("Cannot set pointer attributes on memory block {}: {}",
|
||||
mem.getAddrSpaceId(), ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
gdr_mh_t mh;
|
||||
ret = gdr_pin_buffer(pImpl->gdr, devptr, mem.getSize(), 0, 0, &mh);
|
||||
if(ret != 0) {
|
||||
logger->error("Cannot pin memory block {} via gdrcopy: {}",
|
||||
mem.getAddrSpaceId(), ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
void* bar = nullptr;
|
||||
ret = gdr_map(pImpl->gdr, mh, &bar, mem.getSize());
|
||||
if(ret != 0) {
|
||||
logger->error("Cannot map memory block {} via gdrcopy: {}",
|
||||
mem.getAddrSpaceId(), ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
gdr_info_t info;
|
||||
ret = gdr_get_info(pImpl->gdr, mh, &info);
|
||||
if(ret != 0) {
|
||||
logger->error("Cannot get info for mapping of memory block {}: {}",
|
||||
mem.getAddrSpaceId(), ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
const uintptr_t offset = info.va - devptr;
|
||||
const uintptr_t userPtr = reinterpret_cast<uintptr_t>(bar) + offset;
|
||||
|
||||
logger->debug("BAR ptr: {:p}", bar);
|
||||
logger->debug("info.va: {:#x}", info.va);
|
||||
logger->debug("info.mapped_size: {:#x}", info.mapped_size);
|
||||
logger->debug("info.page_size: {:#x}", info.page_size);
|
||||
logger->debug("offset: {:#x}", offset);
|
||||
logger->debug("user pointer: {:#x}", userPtr);
|
||||
|
||||
// mapping to acceses memory block from process
|
||||
mm.createMapping(userPtr, 0, info.mapped_size, "GDRcopy",
|
||||
mm.getProcessAddressSpace(), mem.getAddrSpaceId());
|
||||
|
||||
// retrieve bus address
|
||||
uint64_t addr[8];
|
||||
ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, 8);
|
||||
|
||||
for(int i = 0; i < ret; i++) {
|
||||
logger->debug("DMA addr[{}]: {:#x}", i, addr[i]);
|
||||
}
|
||||
|
||||
if(ret != 1) {
|
||||
logger->error("Only one DMA address per block supported at the moment");
|
||||
return false;
|
||||
}
|
||||
|
||||
// mapping to access memory block from peer devices via PCIe
|
||||
mm.createMapping(addr[0], 0, mem.getSize(), "GDRcopyDMA",
|
||||
mm.getPciAddressSpace(), mem.getAddrSpaceId());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
Gpu::makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem)
|
||||
{
|
||||
// Check which kind of memory this is and where it resides
|
||||
// There are two possibilities:
|
||||
// - Host memory not managed by CUDA
|
||||
// - IO memory somewhere on the PCIe bus
|
||||
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
bool isIoMemory = false;
|
||||
try {
|
||||
auto path = mm.findPath(mm.getPciAddressSpace(), mem.getAddrSpaceId());
|
||||
isIoMemory = true;
|
||||
} catch(const std::out_of_range&) {
|
||||
// not reachable via PCI -> not IO memory
|
||||
}
|
||||
|
||||
if(isIoMemory) {
|
||||
logger->debug("Memory block {} is assumed to be IO memory",
|
||||
mem.getAddrSpaceId());
|
||||
|
||||
return registerIoMemory(mem);
|
||||
} else {
|
||||
logger->debug("Memory block {} is assumed to be non-CUDA host memory",
|
||||
mem.getAddrSpaceId());
|
||||
|
||||
return registerHostMemory(mem);
|
||||
}
|
||||
}
|
||||
|
||||
void Gpu::memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
|
||||
{
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
|
||||
src.getAddrSpaceId());
|
||||
const void* src_buf = reinterpret_cast<void*>(src_translation.getLocalAddr(0));
|
||||
|
||||
auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
|
||||
dst.getAddrSpaceId());
|
||||
void* dst_buf = reinterpret_cast<void*>(dst_translation.getLocalAddr(0));
|
||||
|
||||
cudaSetDevice(gpuId);
|
||||
cudaMemcpy(dst_buf, src_buf, size, cudaMemcpyDefault);
|
||||
}
|
||||
|
||||
void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
|
||||
{
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
|
||||
src.getAddrSpaceId());
|
||||
auto src_buf = reinterpret_cast<uint8_t*>(src_translation.getLocalAddr(0));
|
||||
|
||||
auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
|
||||
dst.getAddrSpaceId());
|
||||
auto dst_buf = reinterpret_cast<uint8_t*>(dst_translation.getLocalAddr(0));
|
||||
|
||||
cudaSetDevice(gpuId);
|
||||
kernel_memcpy<<<1, 1>>>(dst_buf, src_buf, size);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
|
||||
GpuAllocator::allocateBlock(size_t size)
|
||||
{
|
||||
cudaSetDevice(gpu.gpuId);
|
||||
|
||||
void* addr;
|
||||
if(cudaSuccess != cudaMalloc(&addr, size)) {
|
||||
logger->error("cudaMalloc(..., size={}) failed", size);
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
// assemble name for this block
|
||||
std::stringstream name;
|
||||
name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
|
||||
|
||||
auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
|
||||
auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
|
||||
|
||||
const auto localAddr = reinterpret_cast<uintptr_t>(addr);
|
||||
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
|
||||
mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
|
||||
|
||||
insertMemoryBlock(*mem);
|
||||
|
||||
gpu.makeAccessibleToPCIeAndVA(*mem);
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
|
||||
Gpu::Gpu(int gpuId) :
|
||||
pImpl{std::make_unique<impl>()},
|
||||
gpuId(gpuId)
|
||||
{
|
||||
logger = loggerGetOrCreate(getName());
|
||||
|
||||
pImpl->gdr = gdr_open();
|
||||
if(pImpl->gdr == nullptr) {
|
||||
logger->warn("No GDRcopy support enabled, cannot open /dev/gdrdrv");
|
||||
}
|
||||
}
|
||||
|
||||
bool Gpu::init()
|
||||
{
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
const auto gpuPciEAddrSpaceName = mm.getMasterAddrSpaceName(getName(), "PCIe");
|
||||
masterPciEAddrSpaceId = mm.getOrCreateAddressSpace(gpuPciEAddrSpaceName);
|
||||
|
||||
allocator = std::make_unique<GpuAllocator>(*this);
|
||||
|
||||
cudaDeviceProp deviceProp;
|
||||
cudaGetDeviceProperties(&deviceProp, gpuId);
|
||||
|
||||
pImpl->pdev.slot = {
|
||||
deviceProp.pciDomainID,
|
||||
deviceProp.pciBusID,
|
||||
deviceProp.pciDeviceID,
|
||||
0};
|
||||
|
||||
struct pci_region* pci_regions = nullptr;
|
||||
const size_t pci_num_regions = pci_get_regions(&pImpl->pdev, &pci_regions);
|
||||
for(size_t i = 0; i < pci_num_regions; i++) {
|
||||
const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
|
||||
logger->info("BAR{}: bus addr={:#x} size={:#x}",
|
||||
pci_regions[i].num, pci_regions[i].start, region_size);
|
||||
|
||||
char name[] = "BARx";
|
||||
name[3] = '0' + pci_regions[i].num;
|
||||
|
||||
auto gpuBarXAddrSpaceName = mm.getSlaveAddrSpaceName(getName(), name);
|
||||
auto gpuBarXAddrSpaceId = mm.getOrCreateAddressSpace(gpuBarXAddrSpaceName);
|
||||
|
||||
mm.createMapping(pci_regions[i].start, 0, region_size,
|
||||
std::string("PCI-") + name,
|
||||
mm.getPciAddressSpace(), gpuBarXAddrSpaceId);
|
||||
}
|
||||
|
||||
free(pci_regions);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
std::list<std::unique_ptr<Gpu>>
|
||||
GpuFactory::make()
|
||||
{
|
||||
int deviceCount = 0;
|
||||
cudaGetDeviceCount(&deviceCount);
|
||||
|
||||
std::list<std::unique_ptr<Gpu>> gpuList;
|
||||
|
||||
for(int gpuId = 0; gpuId < deviceCount; gpuId++) {
|
||||
if(cudaSetDevice(gpuId) != cudaSuccess) {
|
||||
logger->warn("Cannot activate GPU {}", gpuId);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto gpu = std::make_unique<Gpu>(gpuId);
|
||||
|
||||
if(not gpu->init()) {
|
||||
logger->warn("Cannot initialize GPU {}", gpuId);
|
||||
continue;
|
||||
}
|
||||
|
||||
gpuList.emplace_back(std::move(gpu));
|
||||
}
|
||||
|
||||
logger->info("Initialized {} GPUs", gpuList.size());
|
||||
for(auto& gpu : gpuList) {
|
||||
logger->debug(" - {}", gpu->getName());
|
||||
}
|
||||
|
||||
return gpuList;
|
||||
}
|
||||
|
||||
} // namespace villas
|
||||
} // namespace gpu
|
87
fpga/lib/gpu/include/villas/gpu.hpp
Normal file
87
fpga/lib/gpu/include/villas/gpu.hpp
Normal file
|
@ -0,0 +1,87 @@
|
|||
#pragma once
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include <plugin.hpp>
|
||||
#include <memory_manager.hpp>
|
||||
#include <memory.hpp>
|
||||
#include <villas/log.hpp>
|
||||
|
||||
|
||||
namespace villas {
|
||||
namespace gpu {
|
||||
|
||||
class GpuAllocator;
|
||||
|
||||
class Gpu {
|
||||
friend GpuAllocator;
|
||||
public:
|
||||
Gpu(int gpuId);
|
||||
~Gpu();
|
||||
|
||||
bool init();
|
||||
|
||||
std::string getName() const;
|
||||
|
||||
GpuAllocator& getAllocator() const
|
||||
{ return *allocator; }
|
||||
|
||||
|
||||
bool makeAccessibleToPCIeAndVA(const MemoryBlock& mem);
|
||||
|
||||
/// Make some memory block accssible for this GPU
|
||||
bool makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem);
|
||||
|
||||
void memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
|
||||
|
||||
void memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
|
||||
|
||||
private:
|
||||
bool registerIoMemory(const MemoryBlock& mem);
|
||||
bool registerHostMemory(const MemoryBlock& mem);
|
||||
|
||||
private:
|
||||
class impl;
|
||||
std::unique_ptr<impl> pImpl;
|
||||
|
||||
// master, will be used to derived slave addr spaces for allocation
|
||||
MemoryManager::AddressSpaceId masterPciEAddrSpaceId;
|
||||
|
||||
MemoryManager::AddressSpaceId slaveMemoryAddrSpaceId;
|
||||
|
||||
SpdLogger logger;
|
||||
|
||||
int gpuId;
|
||||
|
||||
std::unique_ptr<GpuAllocator> allocator;
|
||||
};
|
||||
|
||||
|
||||
class GpuAllocator : public BaseAllocator<GpuAllocator> {
|
||||
public:
|
||||
GpuAllocator(Gpu& gpu);
|
||||
|
||||
std::string getName() const;
|
||||
|
||||
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
|
||||
allocateBlock(size_t size);
|
||||
|
||||
private:
|
||||
Gpu& gpu;
|
||||
};
|
||||
|
||||
class GpuFactory : public Plugin {
|
||||
public:
|
||||
GpuFactory();
|
||||
|
||||
std::list<std::unique_ptr<Gpu>>
|
||||
make();
|
||||
|
||||
void run(void*);
|
||||
|
||||
private:
|
||||
SpdLogger logger;
|
||||
};
|
||||
|
||||
} // namespace villas
|
||||
} // namespace gpu
|
42
fpga/lib/gpu/kernels.cu
Normal file
42
fpga/lib/gpu/kernels.cu
Normal file
|
@ -0,0 +1,42 @@
|
|||
#include <stdio.h>
|
||||
|
||||
#include <villas/gpu.hpp>
|
||||
#include "kernels.hpp"
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include <cuda.h>
|
||||
|
||||
namespace villas {
|
||||
namespace gpu {
|
||||
|
||||
|
||||
__global__ void
|
||||
kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter)
|
||||
{
|
||||
printf("[gpu] hello!\n");
|
||||
printf("[gpu] mailbox: %p\n", mailbox);
|
||||
|
||||
printf("[kernel] started\n");
|
||||
|
||||
while(1) {
|
||||
if (*mailbox == 1) {
|
||||
*mailbox = 0;
|
||||
printf("[gpu] counter = %d\n", *counter);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
printf("[gpu] quit\n");
|
||||
}
|
||||
|
||||
__global__ void
|
||||
kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length)
|
||||
{
|
||||
while(length > 0) {
|
||||
*dst++ = *src++;
|
||||
length--;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace villas
|
||||
} // namespace gpu
|
16
fpga/lib/gpu/kernels.hpp
Normal file
16
fpga/lib/gpu/kernels.hpp
Normal file
|
@ -0,0 +1,16 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace villas {
|
||||
namespace gpu {
|
||||
|
||||
__global__ void
|
||||
kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter);
|
||||
|
||||
__global__ void
|
||||
kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length);
|
||||
|
||||
} // namespace villas
|
||||
} // namespace gpu
|
|
@ -223,8 +223,8 @@ IpCoreFactory::make(PCIeCard* card, json_t *json_ips)
|
|||
json_t* json_block;
|
||||
json_object_foreach(json_instance, block_name, json_block) {
|
||||
|
||||
unsigned int base, high, size;
|
||||
int ret = json_unpack(json_block, "{ s: i, s: i, s: i }",
|
||||
json_int_t base, high, size;
|
||||
int ret = json_unpack(json_block, "{ s: I, s: I, s: I }",
|
||||
"baseaddr", &base,
|
||||
"highaddr", &high,
|
||||
"size", &size);
|
||||
|
|
|
@ -132,8 +132,14 @@ Dma::reset()
|
|||
|
||||
|
||||
bool
|
||||
Dma::pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
|
||||
Dma::memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
|
||||
{
|
||||
if(len == 0)
|
||||
return true;
|
||||
|
||||
if(not connectLoopback())
|
||||
return false;
|
||||
|
||||
if(this->read(dst, len) == 0)
|
||||
return false;
|
||||
|
||||
|
@ -153,17 +159,14 @@ Dma::pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
|
|||
size_t
|
||||
Dma::write(const MemoryBlock& mem, size_t len)
|
||||
{
|
||||
// make sure memory is reachable
|
||||
if(not card->mapMemoryBlock(mem)) {
|
||||
logger->error("Memory not accessible by DMA");
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
// user has to make sure that memory is accessible, otherwise this will throw
|
||||
auto translation = mm.getTranslation(busMasterInterfaces[mm2sInterface],
|
||||
mem.getAddrSpaceId());
|
||||
const void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));
|
||||
|
||||
logger->debug("Write to address: {:p}", buf);
|
||||
return hasScatterGather() ? writeSG(buf, len) : writeSimple(buf, len);
|
||||
}
|
||||
|
||||
|
@ -171,17 +174,14 @@ Dma::write(const MemoryBlock& mem, size_t len)
|
|||
size_t
|
||||
Dma::read(const MemoryBlock& mem, size_t len)
|
||||
{
|
||||
// make sure memory is reachable
|
||||
if(not card->mapMemoryBlock(mem)) {
|
||||
logger->error("Memory not accessible by DMA");
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
// user has to make sure that memory is accessible, otherwise this will throw
|
||||
auto translation = mm.getTranslation(busMasterInterfaces[s2mmInterface],
|
||||
mem.getAddrSpaceId());
|
||||
void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));
|
||||
|
||||
logger->debug("Read from address: {:p}", buf);
|
||||
return hasScatterGather() ? readSG(buf, len) : readSimple(buf, len);
|
||||
}
|
||||
|
||||
|
@ -350,6 +350,48 @@ Dma::readCompleteSimple()
|
|||
}
|
||||
|
||||
|
||||
bool
|
||||
Dma::makeAccesibleFromVA(const MemoryBlock& mem)
|
||||
{
|
||||
// only symmetric mapping supported currently
|
||||
if(isMemoryBlockAccesible(mem, s2mmInterface) and
|
||||
isMemoryBlockAccesible(mem, mm2sInterface)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// try mapping via FPGA-card (VFIO)
|
||||
if(not card->mapMemoryBlock(mem)) {
|
||||
logger->error("Memory not accessible by DMA");
|
||||
return false;
|
||||
}
|
||||
|
||||
// sanity-check if mapping worked, this shouldn't be neccessary
|
||||
if(not isMemoryBlockAccesible(mem, s2mmInterface) or
|
||||
not isMemoryBlockAccesible(mem, mm2sInterface)) {
|
||||
logger->error("Mapping memory via card didn't work, but reported success?!");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
Dma::isMemoryBlockAccesible(const MemoryBlock& mem, const std::string& interface)
|
||||
{
|
||||
auto& mm = MemoryManager::get();
|
||||
|
||||
try {
|
||||
mm.findPath(getMasterAddrSpaceByInterface(interface), mem.getAddrSpaceId());
|
||||
} catch(const std::out_of_range&) {
|
||||
// not (yet) accessible
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
} // namespace ip
|
||||
} // namespace fpga
|
||||
} // namespace villas
|
||||
|
|
|
@ -42,14 +42,23 @@ AxiPciExpressBridge::init()
|
|||
|
||||
// Throw an exception if the is no bus master interface and thus no
|
||||
// address space we can use for translation -> error
|
||||
const MemoryManager::AddressSpaceId myAddrSpaceid =
|
||||
busMasterInterfaces.at(axiInterface);
|
||||
card->addrSpaceIdHostToDevice = busMasterInterfaces.at(axiInterface);
|
||||
|
||||
// Create an identity mapping from the FPGA card to this IP as an entry
|
||||
// point to all other IPs in the FPGA, because Vivado will generate a
|
||||
// memory view for this bridge that can see all others.
|
||||
MemoryManager::get().createMapping(0x00, 0x00, SIZE_MAX, "PCIeBridge",
|
||||
card->addrSpaceIdHostToDevice, myAddrSpaceid);
|
||||
/* Map PCIe BAR0 via VFIO */
|
||||
const void* bar0_mapped = card->vfioDevice->regionMap(VFIO_PCI_BAR0_REGION_INDEX);
|
||||
if (bar0_mapped == MAP_FAILED) {
|
||||
logger->error("Failed to mmap() BAR0");
|
||||
return false;
|
||||
}
|
||||
|
||||
// determine size of BAR0 region
|
||||
const size_t bar0_size = card->vfioDevice->regionGetSize(VFIO_PCI_BAR0_REGION_INDEX);
|
||||
|
||||
// create a mapping from process address space to the FPGA card via vfio
|
||||
mm.createMapping(reinterpret_cast<uintptr_t>(bar0_mapped),
|
||||
0, bar0_size, "VFIO-H2D",
|
||||
mm.getProcessAddressSpace(),
|
||||
card->addrSpaceIdHostToDevice);
|
||||
|
||||
|
||||
/* Make PCIe (IOVA) address space available to FPGA via BAR0 */
|
||||
|
@ -63,6 +72,102 @@ AxiPciExpressBridge::init()
|
|||
card->addrSpaceIdDeviceToHost =
|
||||
mm.getOrCreateAddressSpace(addrSpaceNameDeviceToHost);
|
||||
|
||||
auto pciAddrSpaceId = mm.getPciAddressSpace();
|
||||
|
||||
struct pci_region* pci_regions = nullptr;
|
||||
size_t num_regions = pci_get_regions(card->pdev, &pci_regions);
|
||||
|
||||
for(size_t i = 0; i < num_regions; i++) {
|
||||
const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
|
||||
|
||||
char barName[] = "BARx";
|
||||
barName[3] = '0' + pci_regions[i].num;
|
||||
auto pciBar = pcieToAxiTranslations.at(barName);
|
||||
|
||||
|
||||
logger->info("PCI-BAR{}: bus addr={:#x} size={:#x}",
|
||||
pci_regions[i].num, pci_regions[i].start, region_size);
|
||||
logger->info("PCI-BAR{}: AXI translation offset {:#x}",
|
||||
i, pciBar.translation);
|
||||
|
||||
mm.createMapping(pci_regions[i].start, pciBar.translation, region_size,
|
||||
std::string("PCI-") + barName,
|
||||
pciAddrSpaceId, card->addrSpaceIdHostToDevice);
|
||||
|
||||
}
|
||||
|
||||
if(pci_regions != nullptr) {
|
||||
logger->debug("freeing pci regions");
|
||||
free(pci_regions);
|
||||
}
|
||||
|
||||
|
||||
for(auto& [barName, axiBar] : axiToPcieTranslations) {
|
||||
logger->info("AXI-{}: bus addr={:#x} size={:#x}",
|
||||
barName, axiBar.base, axiBar.size);
|
||||
logger->info("AXI-{}: PCI translation offset: {:#x}",
|
||||
barName, axiBar.translation);
|
||||
|
||||
auto barXAddrSpaceName = mm.getSlaveAddrSpaceName(getInstanceName(), barName);
|
||||
auto barXAddrSpaceId = mm.getOrCreateAddressSpace(barXAddrSpaceName);
|
||||
|
||||
// base is already incorporated into mapping of each IP by Vivado, so
|
||||
// the mapping src has to be 0
|
||||
mm.createMapping(0, axiBar.translation, axiBar.size,
|
||||
std::string("AXI-") + barName,
|
||||
barXAddrSpaceId, pciAddrSpaceId);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
AxiPciExpressBridgeFactory::configureJson(IpCore& ip, json_t* json_ip)
|
||||
{
|
||||
auto logger = getLogger();
|
||||
auto& pcie = reinterpret_cast<AxiPciExpressBridge&>(ip);
|
||||
|
||||
for(auto barType : std::list<std::string>{"axi_bars", "pcie_bars"}) {
|
||||
json_t* json_bars = json_object_get(json_ip, barType.c_str());
|
||||
if(not json_is_object(json_bars)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
json_t* json_bar;
|
||||
const char* bar_name;
|
||||
json_object_foreach(json_bars, bar_name, json_bar) {
|
||||
unsigned int translation;
|
||||
int ret = json_unpack(json_bar, "{ s: i }", "translation", &translation);
|
||||
if(ret != 0) {
|
||||
logger->error("Cannot parse {}/{}", barType, bar_name);
|
||||
return false;
|
||||
}
|
||||
|
||||
if(barType == "axi_bars") {
|
||||
json_int_t base, high, size;
|
||||
int ret = json_unpack(json_bar, "{ s: I, s: I, s: I }",
|
||||
"baseaddr", &base,
|
||||
"highaddr", &high,
|
||||
"size", &size);
|
||||
if(ret != 0) {
|
||||
logger->error("Cannot parse {}/{}", barType, bar_name);
|
||||
return false;
|
||||
}
|
||||
|
||||
pcie.axiToPcieTranslations[bar_name] = {
|
||||
.base = static_cast<uintptr_t>(base),
|
||||
.size = static_cast<size_t>(size),
|
||||
.translation = translation
|
||||
};
|
||||
|
||||
} else {
|
||||
pcie.pcieToAxiTranslations[bar_name] = {
|
||||
.translation = translation
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -254,10 +254,78 @@ struct pci_device * pci_lookup_device(struct pci *p, struct pci_device *f)
|
|||
return list_search(&p->devices, (cmp_cb_t) pci_device_compare, (void *) f);
|
||||
}
|
||||
|
||||
size_t pci_get_regions(const struct pci_device *d, struct pci_region** regions)
|
||||
{
|
||||
FILE* f;
|
||||
char sysfs[1024];
|
||||
|
||||
assert(regions != NULL);
|
||||
|
||||
snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/resource",
|
||||
SYSFS_PATH, d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
|
||||
|
||||
f = fopen(sysfs, "r");
|
||||
if (!f)
|
||||
serror("Failed to open resource mapping %s", sysfs);
|
||||
|
||||
struct pci_region _regions[8];
|
||||
struct pci_region* cur_region = _regions;
|
||||
size_t valid_regions = 0;
|
||||
|
||||
ssize_t bytesRead;
|
||||
char* line = NULL;
|
||||
size_t len = 0;
|
||||
|
||||
int region = 0;
|
||||
// cap to 8 regions, just because we don't know how many may exist
|
||||
while(region < 8 && (bytesRead = getline(&line, &len, f)) != -1) {
|
||||
unsigned long long tokens[3];
|
||||
char* s = line;
|
||||
for(int i = 0; i < 3; i++) {
|
||||
char* end;
|
||||
tokens[i] = strtoull(s, &end, 16);
|
||||
if(s == end) {
|
||||
printf("Error parsing line %d of %s\n", region + 1, sysfs);
|
||||
tokens[0] = tokens[1] = 0; // mark invalid
|
||||
break;
|
||||
}
|
||||
s = end;
|
||||
}
|
||||
|
||||
free(line);
|
||||
|
||||
// required for getline() to allocate a new buffer on the next iteration
|
||||
line = NULL;
|
||||
len = 0;
|
||||
|
||||
if(tokens[0] != tokens[1]) {
|
||||
// this is a valid region
|
||||
cur_region->num = region;
|
||||
cur_region->start = tokens[0];
|
||||
cur_region->end = tokens[1];
|
||||
cur_region->flags = tokens[2];
|
||||
cur_region++;
|
||||
valid_regions++;
|
||||
}
|
||||
|
||||
region++;
|
||||
}
|
||||
|
||||
if(valid_regions > 0) {
|
||||
const size_t len = valid_regions * sizeof (struct pci_region);
|
||||
*regions = malloc(len);
|
||||
memcpy(*regions, _regions, len);
|
||||
}
|
||||
|
||||
return valid_regions;
|
||||
}
|
||||
|
||||
|
||||
int pci_get_driver(const struct pci_device *d, char *buf, size_t buflen)
|
||||
{
|
||||
int ret;
|
||||
char sysfs[1024], syml[1024];
|
||||
memset(syml, 0, sizeof(syml));
|
||||
|
||||
snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/driver", SYSFS_PATH,
|
||||
d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
|
||||
|
@ -305,6 +373,7 @@ int pci_get_iommu_group(const struct pci_device *d)
|
|||
{
|
||||
int ret;
|
||||
char *group, link[1024], sysfs[1024];
|
||||
memset(link, 0, sizeof(link));
|
||||
|
||||
snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/iommu_group", SYSFS_PATH,
|
||||
d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
|
||||
|
|
|
@ -87,7 +87,7 @@ VfioContainer::VfioContainer()
|
|||
}
|
||||
|
||||
/* Open VFIO API */
|
||||
fd = open(VFIO_DEV("vfio"), O_RDWR);
|
||||
fd = open(VFIO_DEV, O_RDWR);
|
||||
if (fd < 0) {
|
||||
logger->error("Failed to open VFIO container");
|
||||
throw std::exception();
|
||||
|
@ -108,12 +108,25 @@ VfioContainer::VfioContainer()
|
|||
logger->error("Failed to get VFIO extensions");
|
||||
throw std::exception();
|
||||
}
|
||||
else if (ret > 0)
|
||||
else if (ret > 0) {
|
||||
extensions |= (1 << i);
|
||||
}
|
||||
}
|
||||
|
||||
logger->debug("Version: {:#x}", version);
|
||||
hasIommu = false;
|
||||
|
||||
if(not (extensions & (1 << VFIO_NOIOMMU_IOMMU))) {
|
||||
if(not (extensions & (1 << VFIO_TYPE1_IOMMU))) {
|
||||
logger->error("No supported IOMMU extension found");
|
||||
throw std::exception();
|
||||
} else {
|
||||
hasIommu = true;
|
||||
}
|
||||
}
|
||||
|
||||
logger->debug("Version: {:#x}", version);
|
||||
logger->debug("Extensions: {:#x}", extensions);
|
||||
logger->debug("IOMMU: {}", hasIommu ? "yes" : "no");
|
||||
}
|
||||
|
||||
|
||||
|
@ -289,7 +302,7 @@ VfioContainer::attachDevice(const pci_device* pdev)
|
|||
}
|
||||
|
||||
/* Get IOMMU group of device */
|
||||
int index = pci_get_iommu_group(pdev);
|
||||
int index = isIommuEnabled() ? pci_get_iommu_group(pdev) : 0;
|
||||
if (index < 0) {
|
||||
logger->error("Failed to get IOMMU group of device");
|
||||
throw std::exception();
|
||||
|
@ -319,6 +332,11 @@ VfioContainer::memoryMap(uintptr_t virt, uintptr_t phys, size_t length)
|
|||
{
|
||||
int ret;
|
||||
|
||||
if(not hasIommu) {
|
||||
logger->error("DMA mapping not supported without IOMMU");
|
||||
return UINTPTR_MAX;
|
||||
}
|
||||
|
||||
if (length & 0xFFF) {
|
||||
length += 0x1000;
|
||||
length &= ~0xFFF;
|
||||
|
@ -363,6 +381,10 @@ VfioContainer::memoryUnmap(uintptr_t phys, size_t length)
|
|||
{
|
||||
int ret;
|
||||
|
||||
if(not hasIommu) {
|
||||
return true;
|
||||
}
|
||||
|
||||
struct vfio_iommu_type1_dma_unmap dmaUnmap;
|
||||
dmaUnmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
|
||||
dmaUnmap.flags = 0;
|
||||
|
@ -390,7 +412,7 @@ VfioContainer::getOrAttachGroup(int index)
|
|||
}
|
||||
|
||||
// group not yet part of this container, so acquire ownership
|
||||
auto group = VfioGroup::attach(fd, index);
|
||||
auto group = VfioGroup::attach(*this, index);
|
||||
if(not group) {
|
||||
logger->error("Failed to attach to IOMMU group: {}", index);
|
||||
throw std::exception();
|
||||
|
@ -516,6 +538,7 @@ VfioDevice::pciHotReset()
|
|||
|
||||
const size_t reset_infolen = sizeof(struct vfio_pci_hot_reset_info) +
|
||||
sizeof(struct vfio_pci_dependent_device) * 64;
|
||||
|
||||
auto reset_info = reinterpret_cast<struct vfio_pci_hot_reset_info*>
|
||||
(calloc(1, reset_infolen));
|
||||
|
||||
|
@ -540,6 +563,8 @@ VfioDevice::pciHotReset()
|
|||
}
|
||||
}
|
||||
|
||||
free(reset_info);
|
||||
|
||||
const size_t resetlen = sizeof(struct vfio_pci_hot_reset) +
|
||||
sizeof(int32_t) * 1;
|
||||
auto reset = reinterpret_cast<struct vfio_pci_hot_reset*>
|
||||
|
@ -549,10 +574,15 @@ VfioDevice::pciHotReset()
|
|||
reset->count = 1;
|
||||
reset->group_fds[0] = this->group.fd;
|
||||
|
||||
const bool success = ioctl(this->fd, VFIO_DEVICE_PCI_HOT_RESET, reset) == 0;
|
||||
int ret = ioctl(this->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
|
||||
const bool success = (ret == 0);
|
||||
|
||||
free(reset);
|
||||
free(reset_info);
|
||||
|
||||
if(not success and not group.container->isIommuEnabled()) {
|
||||
logger->info("PCI hot reset failed, but this is expected without IOMMU");
|
||||
return true;
|
||||
}
|
||||
|
||||
return success;
|
||||
}
|
||||
|
@ -713,13 +743,18 @@ VfioGroup::~VfioGroup()
|
|||
|
||||
|
||||
std::unique_ptr<VfioGroup>
|
||||
VfioGroup::attach(int containerFd, int groupIndex)
|
||||
VfioGroup::attach(VfioContainer& container, int groupIndex)
|
||||
{
|
||||
std::unique_ptr<VfioGroup> group { new VfioGroup(groupIndex) };
|
||||
|
||||
group->container = &container;
|
||||
|
||||
/* Open group fd */
|
||||
std::stringstream groupPath;
|
||||
groupPath << VFIO_DEV("") << groupIndex;
|
||||
groupPath << VFIO_PATH
|
||||
<< (container.isIommuEnabled() ? "" : "noiommu-")
|
||||
<< groupIndex;
|
||||
|
||||
group->fd = open(groupPath.str().c_str(), O_RDWR);
|
||||
if (group->fd < 0) {
|
||||
logger->error("Failed to open VFIO group {}", group->index);
|
||||
|
@ -729,18 +764,20 @@ VfioGroup::attach(int containerFd, int groupIndex)
|
|||
logger->debug("VFIO group {} (fd {}) has path {}",
|
||||
groupIndex, group->fd, groupPath.str());
|
||||
|
||||
int ret;
|
||||
|
||||
/* Claim group ownership */
|
||||
ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &containerFd);
|
||||
int ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container.getFd());
|
||||
if (ret < 0) {
|
||||
logger->error("Failed to attach VFIO group {} to container fd {} (error {})",
|
||||
group->index, containerFd, ret);
|
||||
group->index, container.getFd(), ret);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/* Set IOMMU type */
|
||||
ret = ioctl(containerFd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
|
||||
int iommu_type = container.isIommuEnabled() ?
|
||||
VFIO_TYPE1_IOMMU :
|
||||
VFIO_NOIOMMU_IOMMU;
|
||||
|
||||
ret = ioctl(container.getFd(), VFIO_SET_IOMMU, iommu_type);
|
||||
if (ret < 0) {
|
||||
logger->error("Failed to set IOMMU type of container: {}", ret);
|
||||
return nullptr;
|
||||
|
|
|
@ -255,4 +255,27 @@ for bram in brams:
|
|||
if instance in ips:
|
||||
ips[instance]['size'] = int(size)
|
||||
|
||||
pcies = root.xpath('.//MODULE[@MODTYPE="axi_pcie"]')
|
||||
for pcie in pcies:
|
||||
instance = pcie.get('INSTANCE')
|
||||
axi_bars = ips[instance].setdefault('axi_bars', {})
|
||||
pcie_bars = ips[instance].setdefault('pcie_bars', {})
|
||||
|
||||
for from_bar, to_bar, from_bars in (('AXIBAR', 'PCIEBAR', axi_bars), ('PCIEBAR', 'AXIBAR', pcie_bars)):
|
||||
from_bar_num = int(pcie.find('.//PARAMETER[@NAME="C_{}_NUM"]'.format(from_bar)).get('VALUE'))
|
||||
|
||||
for i in range(0, from_bar_num):
|
||||
from_bar_to_bar_offset = int(pcie.find('.//PARAMETER[@NAME="C_{}2{}_{}"]'.format(from_bar, to_bar, i)).get('VALUE'), 16)
|
||||
from_bars['BAR{}'.format(i)] = { 'translation': from_bar_to_bar_offset }
|
||||
|
||||
if from_bar == 'AXIBAR':
|
||||
axi_bar_lo = int(pcie.find('.//PARAMETER[@NAME="C_{}_{}"]'.format(from_bar, i)).get('VALUE'), 16)
|
||||
axi_bar_hi = int(pcie.find('.//PARAMETER[@NAME="C_{}_HIGHADDR_{}"]'.format(from_bar, i)).get('VALUE'), 16)
|
||||
axi_bar_size = axi_bar_hi - axi_bar_lo + 1
|
||||
|
||||
axi_bar = from_bars['BAR{}'.format(i)]
|
||||
axi_bar['baseaddr'] = axi_bar_lo
|
||||
axi_bar['highaddr'] = axi_bar_hi
|
||||
axi_bar['size'] = axi_bar_size
|
||||
|
||||
print(json.dumps(ips, indent=2))
|
||||
|
|
|
@ -12,6 +12,10 @@ set(SOURCES
|
|||
graph.cpp
|
||||
)
|
||||
|
||||
if(CMAKE_CUDA_COMPILER)
|
||||
list(APPEND SOURCES gpu.cpp)
|
||||
endif()
|
||||
|
||||
add_executable(unit-tests ${SOURCES})
|
||||
|
||||
find_package(Criterion REQUIRED)
|
||||
|
|
|
@ -26,38 +26,52 @@ Test(fpga, dma, .description = "DMA")
|
|||
|
||||
auto dma = reinterpret_cast<villas::fpga::ip::Dma&>(*ip);
|
||||
|
||||
if(not dma.connectLoopback()) {
|
||||
if(not dma.loopbackPossible()) {
|
||||
logger->info("Loopback test not possible for {}", *ip);
|
||||
continue;
|
||||
}
|
||||
|
||||
count++;
|
||||
|
||||
if(not dma.loopbackPossible()) {
|
||||
logger->info("Loopback test not possible for {}", *ip);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Simple DMA can only transfer up to 4 kb due to PCIe page size burst
|
||||
// limitation
|
||||
size_t len = 4 * (1 << 10);
|
||||
|
||||
// find a block RAM IP to write to
|
||||
auto bramIp = state.cards.front()->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"));
|
||||
auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(bramIp);
|
||||
cr_assert_not_null(bram, "Couldn't find BRAM");
|
||||
|
||||
// Simple DMA can only transfer up to 4 kb due to PCIe page size burst
|
||||
// limitation
|
||||
size_t len = 4 * (1 << 10);
|
||||
|
||||
/* Allocate memory to use with DMA */
|
||||
auto src = villas::HostRam::getAllocator().allocate<char>(len);
|
||||
auto dst = bram->getAllocator().allocate<char>(len);
|
||||
auto src = villas::HostDmaRam::getAllocator().allocate<char>(len);
|
||||
auto dst = villas::HostDmaRam::getAllocator().allocate<char>(len);
|
||||
|
||||
/* ... only works with IOMMU enabled currently */
|
||||
// auto src = bram->getAllocator().allocate<char>(len);
|
||||
// auto dst = bram->getAllocator().allocate<char>(len);
|
||||
|
||||
/* ... only works with IOMMU enabled currently */
|
||||
// auto src = villas::HostRam::getAllocator().allocate<char>(len);
|
||||
// auto dst = villas::HostRam::getAllocator().allocate<char>(len);
|
||||
|
||||
/* Make sure memory is accessible for DMA */
|
||||
cr_assert(dma.makeAccesibleFromVA(src.getMemoryBlock()),
|
||||
"Source memory not accessible for DMA");
|
||||
cr_assert(dma.makeAccesibleFromVA(dst.getMemoryBlock()),
|
||||
"Destination memory not accessible for DMA");
|
||||
|
||||
/* Get new random data */
|
||||
const size_t lenRandom = read_random(&src, len);
|
||||
cr_assert(len == lenRandom, "Failed to get random data");
|
||||
|
||||
|
||||
/* Start transfer */
|
||||
cr_assert(dma.pingPong(src.getMemoryBlock(), dst.getMemoryBlock(), len),
|
||||
cr_assert(dma.memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len),
|
||||
"DMA ping pong failed");
|
||||
|
||||
|
||||
/* Compare data */
|
||||
cr_assert(memcmp(&src, &dst, len) == 0, "Data not equal");
|
||||
|
||||
|
|
129
fpga/tests/gpu.cpp
Normal file
129
fpga/tests/gpu.cpp
Normal file
|
@ -0,0 +1,129 @@
|
|||
#include <criterion/criterion.h>
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include <villas/log.hpp>
|
||||
#include <villas/fpga/card.hpp>
|
||||
#include <villas/fpga/ips/dma.hpp>
|
||||
#include <villas/fpga/ips/bram.hpp>
|
||||
|
||||
#include <villas/utils.h>
|
||||
|
||||
#include "global.hpp"
|
||||
|
||||
#include <villas/memory.hpp>
|
||||
#include <villas/gpu.hpp>
|
||||
|
||||
|
||||
Test(fpga, gpu_dma, .description = "GPU DMA tests")
|
||||
{
|
||||
auto logger = loggerGetOrCreate("unittest:dma");
|
||||
|
||||
auto& card = state.cards.front();
|
||||
|
||||
villas::Plugin* plugin = villas::Plugin::lookup(villas::Plugin::Type::Gpu, "");
|
||||
auto gpuPlugin = dynamic_cast<villas::gpu::GpuFactory*>(plugin);
|
||||
cr_assert_not_null(gpuPlugin, "No GPU plugin found");
|
||||
|
||||
auto gpus = gpuPlugin->make();
|
||||
cr_assert(gpus.size() > 0, "No GPUs found");
|
||||
|
||||
// just get first cpu
|
||||
auto& gpu = gpus.front();
|
||||
|
||||
size_t count = 0;
|
||||
for(auto& ip : card->ips) {
|
||||
// skip non-dma IPs
|
||||
if(*ip != villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"))
|
||||
continue;
|
||||
|
||||
logger->info("Testing {}", *ip);
|
||||
|
||||
auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(ip.get());
|
||||
cr_assert_not_null(bram, "Couldn't find BRAM");
|
||||
|
||||
count++;
|
||||
|
||||
size_t len = 4 * (1 << 10);
|
||||
|
||||
/* Allocate memory to use with DMA */
|
||||
|
||||
auto bram0 = bram->getAllocator().allocate<char>(len);
|
||||
auto bram1 = bram->getAllocator().allocate<char>(len);
|
||||
|
||||
gpu->makeAccessibleFromPCIeOrHostRam(bram0.getMemoryBlock());
|
||||
gpu->makeAccessibleFromPCIeOrHostRam(bram1.getMemoryBlock());
|
||||
|
||||
auto hostRam0 = villas::HostRam::getAllocator().allocate<char>(len);
|
||||
auto hostRam1 = villas::HostRam::getAllocator().allocate<char>(len);
|
||||
|
||||
gpu->makeAccessibleFromPCIeOrHostRam(hostRam0.getMemoryBlock());
|
||||
gpu->makeAccessibleFromPCIeOrHostRam(hostRam1.getMemoryBlock());
|
||||
|
||||
auto dmaRam0 = villas::HostDmaRam::getAllocator().allocate<char>(len);
|
||||
auto dmaRam1 = villas::HostDmaRam::getAllocator().allocate<char>(len);
|
||||
|
||||
gpu->makeAccessibleFromPCIeOrHostRam(dmaRam0.getMemoryBlock());
|
||||
gpu->makeAccessibleFromPCIeOrHostRam(dmaRam1.getMemoryBlock());
|
||||
|
||||
auto gpuMem0 = gpu->getAllocator().allocate<char>(64 << 10);
|
||||
auto gpuMem1 = gpu->getAllocator().allocate<char>(64 << 10);
|
||||
|
||||
gpu->makeAccessibleToPCIeAndVA(gpuMem0.getMemoryBlock());
|
||||
gpu->makeAccessibleToPCIeAndVA(gpuMem1.getMemoryBlock());
|
||||
|
||||
|
||||
// auto& src = bram0;
|
||||
// auto& dst = bram1;
|
||||
|
||||
// auto& src = hostRam0;
|
||||
// auto& dst = hostRam1;
|
||||
|
||||
auto& src = dmaRam0;
|
||||
// auto& dst = dmaRam1;
|
||||
|
||||
// auto& src = gpuMem0;
|
||||
auto& dst = gpuMem1;
|
||||
|
||||
|
||||
std::list<std::pair<std::string, std::function<void()>>> memcpyFuncs = {
|
||||
{"cudaMemcpy", [&]() {gpu->memcpySync(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
|
||||
{"CUDA kernel", [&]() {gpu->memcpyKernel(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
|
||||
};
|
||||
|
||||
auto dmaIp = card->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_dma:"));
|
||||
auto dma = dynamic_cast<villas::fpga::ip::Dma*>(dmaIp);
|
||||
|
||||
if(dma != nullptr and dma->connectLoopback()) {
|
||||
memcpyFuncs.push_back({
|
||||
"DMA memcpy", [&]() {
|
||||
if(not dma->makeAccesibleFromVA(src.getMemoryBlock()) or
|
||||
not dma->makeAccesibleFromVA(dst.getMemoryBlock())) {
|
||||
return;
|
||||
}
|
||||
dma->memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len);
|
||||
}});
|
||||
}
|
||||
|
||||
for(auto& [name, memcpyFunc] : memcpyFuncs) {
|
||||
logger->info("Testing {}", name);
|
||||
|
||||
/* Get new random data */
|
||||
const size_t lenRandom = read_random(&src, len);
|
||||
cr_assert(len == lenRandom, "Failed to get random data");
|
||||
|
||||
memcpyFunc();
|
||||
const bool success = memcmp(&src, &dst, len) == 0;
|
||||
|
||||
logger->info(" {}", success ?
|
||||
TXT_GREEN("Passed") :
|
||||
TXT_RED("Failed"));
|
||||
}
|
||||
|
||||
villas::MemoryManager::get().dump();
|
||||
}
|
||||
|
||||
|
||||
cr_assert(count > 0, "No BRAM found");
|
||||
}
|
1
fpga/thirdparty/udmabuf
vendored
Submodule
1
fpga/thirdparty/udmabuf
vendored
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit 65762ca3333cb230e3b80ecf4cb5e605390474f2
|
Loading…
Add table
Reference in a new issue