/** Benchmarks for VILLASfpga * * @author Steffen Vogel * @copyright 2015-2016, Steffen Vogel * This file is part of VILLASnode. All Rights Reserved. Proprietary and confidential. * Unauthorized copying of this file, via any medium is strictly prohibited. **********************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include "config.h" #include "config-fpga.h" /* Some hard-coded configuration for the benchmarks */ #define BENCH_DM 3 // 1 FIFO // 2 DMA SG // 3 DMA Simple #define BENCH_RUNS 3000000 #define BENCH_WARMUP 100 #define BENCH_DM_EXP_MIN 0 #define BENCH_DM_EXP_MAX 20 int fpga_benchmark_datamover(struct fpga *f); int fpga_benchmark_jitter(struct fpga *f); int fpga_benchmark_memcpy(struct fpga *f); int fpga_benchmark_overruns(struct fpga *f); int fpga_benchmark_latency(struct fpga *f); static int intc_flags = 0; static struct utsname uts; int fpga_benchmarks(int argc, char *argv[], struct fpga *f) { int ret; struct bench { const char *name; int (*func)(struct fpga *f); } benchmarks[] = { { "datamover", fpga_benchmark_datamover }, { "jitter", fpga_benchmark_jitter }, { "memcpy", fpga_benchmark_memcpy }, { "overruns", fpga_benchmark_overruns }, { "latency", fpga_benchmark_latency } }; if (argc < 2) error("Usage: fpga benchmark (bench)"); struct bench *bench = NULL; for (int i = 0; i < ARRAY_LEN(benchmarks); i++) { if (strcmp(benchmarks[i].name, argv[1]) == 0) { bench = &benchmarks[i]; break; } } if (bench == NULL) error("There is no benchmark named: %s", argv[1]); ret = uname(&uts); if (ret) return -1; again: ret = bench->func(f); if (ret) error("Benchmark %s failed", bench->name); /* Rerun test with polling */ if (intc_flags == 0) { intc_flags |= INTC_POLLING; getchar(); goto again; } return -1; } extern int dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc); extern int dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); extern int dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); static int lapack_generate_workload(int N, double *C) { double *A = alloc(N * N * sizeof(double)); srand(time(NULL)); for (int i = 0; i < N * N; i++) A[i] = 100 * (double) rand() / RAND_MAX + 1; char transA = 'T'; char transB = 'N'; double alpha = 1; double beta = 1; /* C = A' * A, to get an invertible matrix */ dgemm_(&transA, &transB, &N, &N, &N, &alpha, A, &N, A, &N, &beta, C, &N); free(A); return 0; } static int lapack_workload(int N, double *A) { int info = 0; int lworkspace = N; int ipiv[N]; double workspace[N]; dgetrf_(&N, &N, A, &N, ipiv, &info); if (info > 0) error("Failed to pivot matrix"); dgetri_(&N, A, &N, ipiv, workspace, &lworkspace, &info); if (info > 0) error("Failed to LU factorized matrix"); return 0; } int fpga_benchmark_overruns(struct fpga *f) { struct ip *rtds, *dm; dm = list_lookup(&f->ips, "dma_1"); rtds = list_lookup(&f->ips, "rtds_axis_0"); if (!rtds || !f->intc) return -1; int ret; float period = 50e-6; int runs = 1.0 / period; int overruns; info("runs = %u", runs); switch_connect(f->sw, dm, rtds); switch_connect(f->sw, rtds, dm); intc_enable(f->intc, (1 << (dm->irq + 1 )), intc_flags); /* Dump results */ char fn[256]; snprintf(fn, sizeof(fn), "results/overruns_lu_rtds_axis_%s_%s.dat", intc_flags & INTC_POLLING ? "polling" : "irq", uts.release); FILE *g = fopen(fn, "w"); fprintf(g, "# period = %f\n", period); fprintf(g, "# runs = %u\n", runs); struct dma_mem mem; ret = dma_alloc(dm, &mem, 0x1000, 0); if (ret) error("Failed to allocate DMA memory"); uint32_t *data_rx = (uint32_t *) mem.base_virt; uint32_t *data_tx = (uint32_t *) mem.base_virt + 0x200; uint64_t total, start, stop; for (int p = 3; p < 45; p++) { double *A = alloc(p*p*sizeof(double)); lapack_generate_workload(p, A); overruns = 0; total = 0; for (int i = 0; i < 2000; i++) { dma_read(dm, mem.base_phys, 0x200); dma_read_complete(dm, NULL, NULL); } for (int i = 0; i < runs + BENCH_WARMUP; i++) { dma_read(dm, mem.base_phys, 0x200); start = rdtscp(); lapack_workload(p, A); stop = rdtscp(); dma_read_complete(dm, NULL, NULL); /* Send data to rtds */ data_tx[0] = i; dma_write(dm, mem.base_phys + 0x200, 64 * sizeof(data_tx[0])); if (i < BENCH_WARMUP) continue; if (i - data_rx[0] > 2) overruns++; total += stop - start; } free(A); info("iter = %u clks = %ju overruns = %u", p, total / runs, overruns); fprintf(g, "%u %ju %u\n", p, total / runs, overruns); if (overruns >= runs) break; } fclose(g); return 0; } int fpga_benchmark_jitter(struct fpga *f) { int ret; struct ip *tmr; tmr = list_lookup(&f->ips, "timer_0"); if (!tmr || !f->intc) return -1; XTmrCtr *xtmr = &tmr->timer.inst; ret = intc_enable(f->intc, (1 << tmr->irq), intc_flags); if (ret) error("Failed to enable interrupt"); float period = 50e-6; int runs = 300.0 / period; int *hist = alloc(8 << 20); XTmrCtr_SetOptions(xtmr, 0, XTC_INT_MODE_OPTION | XTC_EXT_COMPARE_OPTION | XTC_DOWN_COUNT_OPTION | XTC_AUTO_RELOAD_OPTION); XTmrCtr_SetResetValue(xtmr, 0, period * AXI_HZ); XTmrCtr_Start(xtmr, 0); uint64_t end, start = rdtscp(); for (int i = 0; i < runs; i++) { uint64_t cnt = intc_wait(f->intc, tmr->irq); if (cnt != 1) warn("fail"); /* Ackowledge IRQ */ XTmrCtr_WriteReg((uintptr_t) f->map + tmr->baseaddr, 0, XTC_TCSR_OFFSET, XTmrCtr_ReadReg((uintptr_t) f->map + tmr->baseaddr, 0, XTC_TCSR_OFFSET)); end = rdtscp(); hist[i] = end - start; start = end; } XTmrCtr_Stop(xtmr, 0); char fn[256]; snprintf(fn, sizeof(fn), "results/jitter_%s_%s.dat", intc_flags & INTC_POLLING ? "polling" : "irq", uts.release); FILE *g = fopen(fn, "w"); for (int i = 0; i < runs; i++) fprintf(g, "%u\n", hist[i]); fclose(g); free(hist); ret = intc_disable(f->intc, (1 << tmr->irq)); if (ret) error("Failed to disable interrupt"); return 0; } int fpga_benchmark_latency(struct fpga *f) { int ret; uint64_t start, end; if (!f->intc) return -1; int runs = 1000000; int hist[runs]; ret = intc_enable(f->intc, 0x100, intc_flags); if (ret) error("Failed to enable interrupts"); for (int i = 0; i < runs; i++) { start = rdtscp(); XIntc_Out32((uintptr_t) f->map + f->intc->baseaddr + XIN_ISR_OFFSET, 0x100); intc_wait(f->intc, 8); end = rdtscp(); hist[i] = end - start; } char fn[256]; snprintf(fn, sizeof(fn), "results/latency_%s_%s.dat", intc_flags & INTC_POLLING ? "polling" : "irq", uts.release); FILE *g = fopen(fn, "w"); for (int i = 0; i < runs; i++) fprintf(g, "%u\n", hist[i]); fclose(g); ret = intc_disable(f->intc, 0x100); if (ret) error("Failed to disable interrupt"); return 0; } int fpga_benchmark_datamover(struct fpga *f) { int ret; struct ip *dm; struct dma_mem mem, src, dst; #if BENCH_DM == 1 char *dm_name = "fifo_mm_s_0"; #elif BENCH_DM == 2 char *dm_name = "dma_0"; #elif BENCH_DM == 3 char *dm_name = "dma_1"; #endif dm = list_lookup(&f->ips, dm_name); if (!dm) error("Unknown datamover"); ret = switch_connect(f->sw, dm, dm); if (ret) error("Failed to configure switch"); ret = intc_enable(f->intc, (1 << dm->irq) | (1 << (dm->irq + 1)), intc_flags); if (ret) error("Failed to enable interrupt"); /* Allocate DMA memory */ ret = dma_alloc(dm, &mem, 2 * (1 << BENCH_DM_EXP_MAX), 0); if (ret) error("Failed to allocate DMA memory"); ret = dma_mem_split(&mem, &src, &dst); if (ret) return -1; /* Open file for results */ char fn[256]; snprintf(fn, sizeof(fn), "results/datamover_%s_%s_%s.dat", dm_name, intc_flags & INTC_POLLING ? "polling" : "irq", uts.release); FILE *g = fopen(fn, "w"); for (int exp = BENCH_DM_EXP_MIN; exp <= BENCH_DM_EXP_MAX; exp++) { uint64_t start, stop, total = 0, len = 1 << exp; #if BENCH_DM == 1 if (exp > 11) break; /* FIFO and Simple DMA are limited to 4kb */ #elif BENCH_DM == 3 if (exp >= 12) break; /* FIFO and Simple DMA are limited to 4kb */ #endif read_random(src.base_virt, len); memset(dst.base_virt, 0, len); info("Start DM bench: len=%#jx", len); uint64_t runs = BENCH_RUNS >> exp; for (int i = 0; i < runs + BENCH_WARMUP; i++) { start = rdtscp(); #if BENCH_DM == 1 ssize_t ret; ret = fifo_write(dm, src.base_virt, len); if (ret < 0) error("Failed write to FIFO with len = %zu", len); ret = fifo_read(dm, dst.base_virt, dst.len); if (ret < 0) error("Failed read from FIFO with len = %zu", len); #else ret = dma_ping_pong(dm, src.base_phys, dst.base_phys, len); if (ret) error("DMA ping pong failed"); #endif stop = rdtscp(); if (memcmp(src.base_virt, dst.base_virt, len)) warn("Compare failed"); if (i > BENCH_WARMUP) total += stop - start; } info("exp %u avg %lu", exp, total / runs); fprintf(g, "%lu %lu\n", len, total / runs); } fclose(g); ret = switch_disconnect(f->sw, dm, dm); if (ret) error("Failed to configure switch"); ret = dma_free(dm, &mem); if (ret) error("Failed to release DMA memory"); ret = intc_disable(f->intc, (1 << dm->irq) | (1 << (dm->irq + 1))); if (ret) error("Failed to enable interrupt"); return 0; } int fpga_benchmark_memcpy(struct fpga *f) { char *map = f->map + 0x200000; uint32_t *mapi = (uint32_t *) map; char fn[256]; snprintf(fn, sizeof(fn), "results/bar0_%s_%s.dat", intc_flags & INTC_POLLING ? "polling" : "irq", uts.release); FILE *g = fopen(fn, "w"); fprintf(g, "# bytes cycles\n"); uint32_t dummy = 0; for (int exp = BENCH_DM_EXP_MIN; exp <= BENCH_DM_EXP_MAX; exp++) { uint64_t len = 1 << exp; uint64_t start, end, total = 0; uint64_t runs = (BENCH_RUNS << 2) >> exp; for (int i = 0; i < runs + BENCH_WARMUP; i++) { start = rdtscp(); for (int j = 0; j < len / 4; j++) // mapi[j] = j; // write dummy += mapi[j]; // read end = rdtscp(); if (i > BENCH_WARMUP) total += end - start; } info("exp = %u\truns = %ju\ttotal = %ju\tavg = %ju\tavgw = %ju", exp, runs, total, total / runs, total / (runs * len)); fprintf(g, "%zu %lu %ju\n", len, total / runs, runs); } fclose(g); return 0; }