diff --git a/arch/x86/include/asm/icc.h b/arch/x86/include/asm/icc.h
index 9cca3390..33316d2f 100644
--- a/arch/x86/include/asm/icc.h
+++ b/arch/x86/include/asm/icc.h
@@ -40,7 +40,7 @@ typedef struct {
 extern bootinfo_t* bootinfo;
 
 #define ICC_TAG_IP 0
-#define ICC_TAG_SVM 1
+#define ICC_TAG_SVMREQUEST 1
 #define ICC_TAG_PINGREQUEST 2
 #define ICC_TAG_PINGRESPONSE 3
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 0874259a..5d35ac53 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -41,8 +41,8 @@
 #define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page */
 #define _PAGE_BIT_PAT		7	/* on 4KB pages */
 #define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
-#define _PAGE_BIT_RESERVED	9	/* mark a virtual address range as reserved */
-#define _PAGE_BIT_PAT_LARGE	12	/* On 2MB or 1GB pages */
+#define _PAGE_BIT_SVM_STRONG	9	/* mark a virtual address range as used by the SVM system (strong consistency) */
+#define _PAGE_BIT_SVM_LAZYRELEASE	10	/* mark a virtual address range as used by the SVM system (lazy-release consistency) */
 
 /// Page is present
 #define PG_PRESENT (1 << _PAGE_BIT_PRESENT)
@@ -64,12 +64,12 @@
 #define PG_MPE PG_PSE
 /// Global TLB entry (Pentium Pro and later)
 #define PG_GLOBAL (1 << _PAGE_BIT_GLOBAL)
-/// This virtual address range is reserved as marked
-#define PG_RESERVED (1 << _PAGE_BIT_RESERVED)
 /// Pattern flag
 #define PG_PAT (1 << _PAGE_BIT_PAT)
-/// Large page pattern flag
-#define PG_PAT_LARGE (1 << _PAGE_BIT_PAT_LARGE)
+/// This virtual address range is used by the SVM system with strong consistency
+#define PG_SVM_STRONG (1 << _PAGE_BIT_SVM_STRONG)
+/// This virtual address range is used by the SVM system with lazy-release consistency
+#define PG_SVM_LAZYRELEASE (1 << _PAGE_BIT_SVM_LAZYRELEASE)
 
 /// This is a whole set of flags (PRESENT,RW,ACCESSED,DIRTY) for kernelspace tables
 #define KERN_TABLE (PG_PRESENT|PG_RW|PG_ACCESSED|PG_DIRTY)
@@ -152,7 +152,7 @@ int unmap_region(size_t viraddr, uint32_t npages);
 *
 * @param viraddr Desired virtual address
 * @param phyaddr Physical address to map from
- * @param npages The Region's size in pages
+ * @param npages The region's size in number of pages
 * @param flags Further page flags
 *
 * @return
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index da3b3556..57e1b228 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -106,7 +106,7 @@ inline static void flush_cache(void) {
 * The invd asm instruction which invalidates cache without writing back
 * is used here
 */
-inline static void invalid_cache(void) {
+inline static void invalidate_cache(void) {
 	asm volatile ("invd");
 }
 
@@ -272,6 +272,22 @@ static inline uint32_t read_eflags(void)
 	return result;
 }
 
+/** @brief Determine the most significant (last) set bit
+ *
+ * @param i Source operand
+ * @return Index of the most significant set bit in the source operand; 0 if no bit is set
+ */
+static inline uint32_t last_set(uint32_t i)
+{
+	uint32_t ret;
+
+	if (!i)
+		return 0;
+	asm volatile ("bsr %1, %0" : "=r"(ret) : "r"(i) : "cc");
+
+	return ret;
+}
+
 /** @brief Read extended instruction pointer
 * @return The EIP's value
 */
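/*
 * Example (an illustrative sketch, not part of the patch): how the scheduler
 * introduced later in this patch pairs last_set() with a priority bitmap.
 * Every non-empty run queue sets its priority bit, and the most significant
 * set bit names the next priority to serve. The portable loop below stands in
 * for the bsr-based kernel routine; the priority values match the
 * NORMAL_PRIO/LOW_PRIO definitions added to tasks_types.h further down.
 */
#include <stdint.h>
#include <stdio.h>

/* portable stand-in for last_set(): index of the most significant set bit */
static uint32_t last_set(uint32_t i)
{
	uint32_t ret = 0;

	if (!i)
		return 0;
	while (i >>= 1)
		ret++;

	return ret;
}

int main(void)
{
	const uint32_t NORMAL_PRIO = 8, LOW_PRIO = 1; /* values from tasks_types.h */
	uint32_t prio_bitmap = 0;

	/* a task becomes ready on each of the two queues */
	prio_bitmap |= (1 << NORMAL_PRIO);
	prio_bitmap |= (1 << LOW_PRIO);

	/* the scheduler always serves the highest non-empty queue first */
	printf("next priority to run: %u\n", last_set(prio_bitmap)); /* prints 8 */

	return 0;
}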
diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
index 51d288a5..fd6480e0 100644
--- a/arch/x86/include/asm/string.h
+++ b/arch/x86/include/asm/string.h
@@ -35,6 +35,59 @@ void copy_page_physical(void* dest, const void * src);
 
 #ifdef HAVE_ARCH_MEMCPY
 
+#ifdef CONFIG_ROCKCREEK
+/** @brief Fast procedure to get a byte range from RAM into on-die memory.
+ *
+ * A write access whose cache line is not present does not trigger (on the
+ * current SCC architecture) a cache line fill. In that case, the core writes
+ * directly to memory.
+ *
+ * The following function therefore prefetches its destination and so avoids
+ * the bad behavior of such a "write miss".
+ *
+ * @param dest Destination address
+ * @param src Source address
+ * @param count Range size in bytes
+ */
+inline static void *memcpy(void *dest, const void *src, size_t count)
+{
+	int32_t h, i, j, k, l, m;
+
+	asm volatile ("cld;\n\t"
+		"1: cmpl $0, %%eax ; je 3f\n\t"
+		"movl (%%esi), %%ecx\n\t"
+		"movl (%%edi), %%edx\n\t"
+		"cmpl $1, %%eax ; je 2f\n\t"
+		"movl 32(%%esi), %%ecx\n\t"
+		"movl 32(%%edi), %%edx\n\t"
+		"2: movl 0(%%esi), %%ecx\n\t"
+		"movl 4(%%esi), %%edx\n\t"
+		"movl %%ecx, 0(%%edi)\n\t"
+		"movl %%edx, 4(%%edi)\n\t"
+		"movl 8(%%esi), %%ecx\n\t"
+		"movl 12(%%esi), %%edx\n\t"
+		"movl %%ecx, 8(%%edi)\n\t"
+		"movl %%edx, 12(%%edi)\n\t"
+		"movl 16(%%esi), %%ecx\n\t"
+		"movl 20(%%esi), %%edx\n\t"
+		"movl %%ecx, 16(%%edi)\n\t"
+		"movl %%edx, 20(%%edi)\n\t"
+		"movl 24(%%esi), %%ecx\n\t"
+		"movl 28(%%esi), %%edx\n\t"
+		"movl %%ecx, 24(%%edi)\n\t"
+		"movl %%edx, 28(%%edi)\n\t"
+		"addl $32, %%esi\n\t"
+		"addl $32, %%edi\n\t"
+		"dec %%eax ; jmp 1b\n\t"
+		"3: movl %%ebx, %%ecx\n\t"
+		"movl (%%edi), %%edx\n\t"
+		"andl $31, %%ecx\n\t"
+		"rep ; movsb\n\t"
+		: "=&a" (h), "=&D"(i), "=&S"(j), "=&b"(k), "=&c"(l), "=&d"(m)
+		: "0"(count / 32), "1"(dest), "2"(src), "3"(count) : "memory");
+
+	return dest;
+}
+#else
 /** @brief Copy a byte range from source to dest
 *
 * @param dest Destination address
@@ -60,6 +113,8 @@ inline static void *memcpy(void* dest, const void *src, size_t count)
 }
 #endif
 
+#endif
+
 #ifdef HAVE_ARCH_MEMSET
 
 /** @brief Repeated write of a value to a whole range of bytes
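/*
 * Example (an illustrative sketch, not taken from the patch): the idea behind
 * the prefetching memcpy above, written out in plain C. Reading one word from
 * each destination cache line first forces a line fill, so the subsequent
 * stores become cache hits instead of uncached "write misses". The 32-byte
 * line size matches the asm loop above; the function name is made up for
 * illustration.
 */
#include <stddef.h>
#include <stdint.h>

static void *prefetched_copy(void *dest, const void *src, size_t count)
{
	uint32_t *d = (uint32_t*) dest;
	const uint32_t *s = (const uint32_t*) src;
	size_t line, lines = count / 32;
	volatile uint32_t sink;
	int w;

	for (line = 0; line < lines; line++) {
		sink = d[line * 8];	/* touch the destination line => cache line fill */
		for (w = 0; w < 8; w++)	/* these stores now hit the cache */
			d[line * 8 + w] = s[line * 8 + w];
	}
	(void) sink;

	return dest;
}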
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
new file mode 100644
index 00000000..301fda74
--- /dev/null
+++ b/arch/x86/include/asm/svm.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011 Stefan Lankes, Chair for Operating Systems,
+ * RWTH Aachen University
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This file is part of MetalSVM.
+ */
+
+#ifndef __ARCH_SVM_H__
+#define __ARCH_SVM_H__
+
+#include
+#ifdef CONFIG_ROCKCREEK
+#include
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef CONFIG_ROCKCREEK
+
+#define SVM_STRONG (1 << 0)
+#define SVM_LAZYRELEASE (1 << 1)
+
+/** @brief Init routine of the SVM subsystem
+ *
+ * @return
+ * - 0 on success
+ * - -ENOMEM not enough memory
+ */
+int svm_init(void);
+
+/** @brief Memory allocator of the SVM subsystem
+ *
+ * Like its RCCE counterpart, svmmalloc is a synchronous (collective)
+ * function: all participating cores have to call it.
+ *
+ * @return Pointer to the new memory range
+ */
+void* svmmalloc(size_t size, uint32_t flags);
+
+/** @brief Free memory which is managed by the SVM subsystem
+ *
+ * Like its RCCE counterpart, svmfree is a synchronous (collective) function.
+ */
+void svmfree(void* addr, size_t size);
+
+/** @brief Request exclusive access to a page
+ *
+ * @return
+ * - 0 on success
+ */
+int svm_access_request(size_t addr);
+
+/** @brief Emit a page to core ue
+ *
+ * @return
+ * - 0 on success
+ */
+int svm_emit_page(size_t addr, int ue);
+
+/** @brief Invalidate the cache entries for all SVM regions
+ */
+static inline void svm_invalidate(void)
+{
+	asm volatile ( ".byte 0x0f; .byte 0x0a;\n" ); // CL1FLUSHMB
+}
+
+/** @brief Flush the cache for all SVM regions
+ */
+#ifdef CONFIG_ROCKCREEK
+#ifndef SVM_WB
+static inline void svm_flush(void)
+{
+	// need to write to another line to make sure the write combine buffer gets flushed
+	*(int *)RCCE_fool_write_combine_buffer = 1;
+}
+#else
+void svm_flush(void);
+#endif
+#endif
+
+/** @brief Dump some performance counters (e.g. the number of page migrations)
+ *
+ * @return
+ * - 0 on success
+ */
+int svm_statistics(void);
+
+#endif
+
+#ifdef __cplusplus
}
+#endif
+
+#endif
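/*
 * Example (an illustrative sketch, not taken from the patch): typical
 * collective use of the SVM interface declared above. Every participating
 * core calls svmmalloc() with the same size and consistency flag; with
 * SVM_STRONG, only the owning core may touch a page, and the first access on
 * a non-owning core raises a page fault that svm_access_request() resolves.
 * The header paths, the array length, and the work split are assumptions
 * made for illustration.
 */
#include <asm/svm.h>
#include <asm/RCCE.h>

void svm_example(void)
{
	int i, *shared;
	int ue = RCCE_ue(), num = RCCE_num_ues();

	/* collective allocation: all cores take part */
	shared = (int*) svmmalloc(1024 * sizeof(int), SVM_STRONG);

	/* each core writes its own stripe; page ownership migrates on demand */
	for (i = ue; i < 1024; i += num)
		shared[i] = i;

	RCCE_barrier(&RCCE_COMM_WORLD);

	/* collective release */
	svmfree(shared, 1024 * sizeof(int));
}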
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 379a181b..da4c65f4 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -60,7 +60,7 @@ static uint32_t ncores = 1;
 static uint8_t irq_redirect[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF};
 #if MAX_CORES > 1
 static uint8_t boot_code[] = { 0xFA, 0x0F, 0x01, 0x16, 0x3B, 0x70, 0x0F, 0x20, 0xC0, 0x0C, 0x01, 0x0F, 0x22, 0xC0, 0x66, 0xEA, 0x16, 0x70, 0x00, 0x00, 0x08, 0x00, 0x31, 0xC0, 0x66, 0xB8, 0x10, 0x00, 0x8E, 0xD8, 0x8E, 0xC0, 0x8E, 0xE0, 0x8E, 0xE8, 0x8E, 0xD0, 0xBC, 0xEF, 0xBE, 0xAD, 0xDE, 0x68, 0xAD, 0xDE, 0xAD, 0xDE, 0x6A, 0x00, 0xEA, 0xDE, 0xC0, 0xAD, 0xDE, 0x08, 0x00, 0xEB, 0xFE, 0x17, 0x00, 0x41, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x9A, 0xCF, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x92, 0xCF, 0x00};
-static atomic_int32_t cpu_online = ATOMIC_INIT(1);
+atomic_int32_t cpu_online = ATOMIC_INIT(1);
 #endif
 static uint8_t initialized = 0;
 spinlock_t bootlock = SPINLOCK_INIT;
diff --git a/arch/x86/kernel/timer.c b/arch/x86/kernel/timer.c
index 8a806eb6..af70abfd 100644
--- a/arch/x86/kernel/timer.c
+++ b/arch/x86/kernel/timer.c
@@ -36,6 +36,10 @@
 */
 static volatile uint64_t timer_ticks = 0;
 
+#if MAX_CORES > 1
+extern atomic_int32_t cpu_online;
+#endif
+
 uint64_t get_clock_tick(void)
 {
 	return timer_ticks;
@@ -61,8 +65,6 @@ int sys_times(struct tms* buffer, clock_t* clock)
 */
 static void timer_handler(struct state *s)
 {
-	uint32_t i;
-
 	/* Increment our 'tick counter' */
#if MAX_CORES > 1
 	if (smp_id() == 0)
@@ -78,6 +80,13 @@ static void timer_handler(struct state *s)
 			vga_puts("One second has passed\n");
 		}*/
 	}
+
+	update_load();
+
+#if MAX_CORES > 1
+	if ((atomic_int32_read(&cpu_online) > 1) && (timer_ticks % (TIMER_FREQ/5) == 0))
+		load_balancing();
+#endif
 }
 
 int timer_wait(unsigned int ticks)
@@ -104,12 +113,7 @@ int timer_wait(unsigned int ticks)
 			check_workqueues();
 
 			if (timer_ticks < eticks) {
-				uint32_t flags = irq_nested_disable();
-				curr_task->timeout = eticks;
-				curr_task->flags |= TASK_TIMER_USED;
-				curr_task->status = TASK_BLOCKED;
-				irq_nested_enable(flags);
-
+				set_timer(eticks);
 				reschedule();
 			}
 		}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 2f9b1834..09392539 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,4 +1,4 @@
-C_source := page.c
+C_source := page.c svm.c
 MODULE := arch_x86_mm
 
 include $(TOPDIR)/Makefile.inc
diff --git a/arch/x86/mm/page.c b/arch/x86/mm/page.c
index 6eadd3d1..cb22c6f3 100644
--- a/arch/x86/mm/page.c
+++ b/arch/x86/mm/page.c
@@ -34,6 +34,7 @@
 #ifdef CONFIG_ROCKCREEK
 #include
 #include
+#include <asm/svm.h>
 #include
 #endif
 
@@ -354,6 +355,17 @@ size_t map_region(size_t viraddr, size_t phyaddr, uint32_t npages, uint32_t flag
 		if (flags & MAP_MPE)
 			pgt->entries[index] |= PG_MPE;
 #endif
+		if (flags & MAP_SVM_STRONG)
+#ifndef SVM_WB
+			pgt->entries[index] |= PG_SVM_STRONG|PG_PWT;
+#else
+			pgt->entries[index] |= PG_SVM_STRONG;
+#endif
+		if (flags & MAP_SVM_LAZYRELEASE)
+			pgt->entries[index] |= PG_SVM_LAZYRELEASE|PG_PWT;
+
+		if (flags & MAP_NO_ACCESS)
+			pgt->entries[index] &= ~PG_PRESENT;
 
 		if (flags & MAP_USER_SPACE)
 			atomic_int32_inc(&task->user_usage);
@@ -395,6 +407,11 @@ int change_page_permissions(size_t start, size_t end, uint32_t flags)
 			phyaddr = pgt->entries[index2] & 0xFFFFF000;
 			newflags = pgt->entries[index2] & 0xFFF; // get old flags
 
+			if ((newflags & PG_SVM_STRONG) && !(newflags & PG_PRESENT) && (flags & (VMA_READ|VMA_WRITE)) && !(flags & VMA_NOACCESS))
+				newflags |= PG_PRESENT;
+			else if ((newflags & PG_SVM_STRONG) && (newflags & PG_PRESENT) && (flags & VMA_NOACCESS))
+				newflags &= ~PG_PRESENT;
+
 			// update flags
 			if (!(flags & VMA_WRITE))
 				newflags &= ~PG_RW;
@@ -591,8 +608,13 @@ int print_paging_tree(size_t viraddr)
 static void pagefault_handler(struct state *s)
 {
 	task_t* task = per_core(current_task);
+	page_dir_t* pgd = task->pgd;
+	page_table_t* pgt = NULL;
 	size_t viraddr = read_cr2();
 	size_t phyaddr;
+#ifdef CONFIG_ROCKCREEK
+	uint32_t index1, index2;
+#endif
 
 	if ((viraddr >= task->start_heap) && (viraddr <= task->end_heap) && (viraddr > KERNEL_SPACE)) {
 		viraddr = viraddr & 0xFFFFF000;
@@ -610,6 +632,20 @@ static void pagefault_handler(struct state *s)
 		put_page(phyaddr);
 	}
 
+#ifdef CONFIG_ROCKCREEK
+	// does our SVM system need to handle this page fault?
+	index1 = viraddr >> 22;
+	index2 = (viraddr >> 12) & 0x3FF;
+	if (!pgd || !(pgd->entries[index1] & 0xFFFFF000))
+		goto default_handler;
+	pgt = (page_table_t*) ((KERNEL_SPACE - 1024*PAGE_SIZE + index1*PAGE_SIZE) & 0xFFFFF000);
+	if (!pgt || !(pgt->entries[index2]))
+		goto default_handler;
+	if (pgt->entries[index2] & PG_SVM_STRONG)
+		if (!svm_access_request(viraddr))
+			return;
+#endif
+
+default_handler:
 	kprintf("PAGE FAULT: Task %u got page fault at %p (irq %d, cs:eip 0x%x:0x%x)\n", task->id, viraddr, s->int_no, s->cs, s->eip);
 	kprintf("Register state: eax = 0x%x, ebx = 0x%x, ecx = 0x%x, edx = 0x%x, edi = 0x%x, esi = 0x%x, ebp = 0x%x, esp = 0x%x\n",
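/*
 * Example (an illustrative sketch, not taken from the patch): the address
 * decomposition used by pagefault_handler() above on 32-bit x86. A virtual
 * address splits into a page directory index (top 10 bits), a page table
 * index (next 10 bits) and a page offset (low 12 bits). The sample address
 * is made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t viraddr = 0x40123ABC;

	uint32_t index1 = viraddr >> 22;		/* page directory index: 0x100 */
	uint32_t index2 = (viraddr >> 12) & 0x3FF;	/* page table index: 0x123 */
	uint32_t offset = viraddr & 0xFFF;		/* offset within the page: 0xABC */

	printf("pgd index 0x%x, pgt index 0x%x, offset 0x%x\n", index1, index2, offset);

	return 0;
}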
diff --git a/arch/x86/mm/svm.c b/arch/x86/mm/svm.c
new file mode 100644
index 00000000..1dd2075f
--- /dev/null
+++ b/arch/x86/mm/svm.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright 2011 Stefan Lankes, Chair for Operating Systems,
+ * RWTH Aachen University
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This file is part of MetalSVM.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef CONFIG_ROCKCREEK
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define SHARED_PAGES	(RCCE_SHM_SIZE_MAX >> PAGE_SHIFT)
+#define OWNER_SIZE	((SHARED_PAGES * sizeof(uint8_t) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
+
+t_vcharp RC_SHM_BUFFER_START();
+
+/*
+ * This array describes the owner of a specific page.
+ * Only the owner of a page is able to change the possession.
+ * => No lock is needed.
+ */
+static volatile uint8_t* page_owner = NULL;
+
+// helper array to convert a physical to a virtual address
+static size_t phys2virt[SHARED_PAGES] = {[0 ... SHARED_PAGES-1] = 0};
+static size_t shmbegin = 0;
+static int my_ue = 0;
+static uint32_t emit[RCCE_MAXNP] = {[0 ... RCCE_MAXNP-1] = 0};
+static uint32_t request[RCCE_MAXNP] = {[0 ... RCCE_MAXNP-1] = 0};
+static uint32_t forward[RCCE_MAXNP] = {[0 ... RCCE_MAXNP-1] = 0};
+
+int svm_init(void)
+{
+	size_t phyaddr;
+	uint32_t flags;
+
+	// iRCCE is not thread-safe => disable interrupts
+	flags = irq_nested_disable();
+	my_ue = RCCE_ue();
+	shmbegin = (size_t)RC_SHM_BUFFER_START();
+	phyaddr = (size_t) RCCE_shmalloc(OWNER_SIZE);
+	irq_nested_enable(flags);
+
+	if (BUILTIN_EXPECT(!phyaddr, 0))
+		return -ENOMEM;
+	if (BUILTIN_EXPECT(phyaddr & 0xFFF, 0)) {
+		kprintf("RCCE_shmalloc did not return a page-aligned physical address: 0x%x\n", phyaddr);
+		return -ENOMEM;
+	}
+
+	kprintf("Shared memory starts at the physical address 0x%x\n", shmbegin);
+
+	page_owner = (uint8_t*) map_region(0, phyaddr, OWNER_SIZE >> PAGE_SHIFT, MAP_KERNEL_SPACE|MAP_NO_CACHE);
+	if (BUILTIN_EXPECT(!page_owner, 0)) {
+		flags = irq_nested_disable();
+		RCCE_shfree((t_vcharp) phyaddr);
+		irq_nested_enable(flags);
+		return -ENOMEM;
+	}
+
+	// by default, core 0 is the owner of all pages
+	if (!my_ue)
+		memset((void*)page_owner, 0x00, OWNER_SIZE);
+
+	// iRCCE is not thread-safe => disable interrupts
+	flags = irq_nested_disable();
+	RCCE_barrier(&RCCE_COMM_WORLD);
+	irq_nested_enable(flags);
+
+	return 0;
+}
+
+/*
+ * This function is called by the page fault handler
+ * => the interrupt flag is already cleared
+ */
+int svm_access_request(size_t addr)
+{
+	size_t phyaddr = virt_to_phys(addr);
+	uint32_t pageid;
+	int remote_rank;
+	uint8_t payload[iRCCE_MAIL_HEADER_PAYLOAD];
+
+	if (phyaddr < shmbegin)
+		return -EINVAL;
+	if (phyaddr >= shmbegin + RCCE_SHM_SIZE_MAX)
+		return -EINVAL;
+	pageid = (phyaddr-shmbegin) >> PAGE_SHIFT;
+
+	//svm_flush();
+	if (page_owner[pageid] == my_ue)
+		return 0;
+
+	remote_rank = page_owner[pageid];
+	((size_t*) payload)[0] = my_ue;
+	((size_t*) payload)[1] = phyaddr;
+
+	//kprintf("send access request to %d of 0x%x\n", remote_rank, phyaddr);
+	/* send the ownership request */
+	iRCCE_mail_send(2*sizeof(size_t), ICC_TAG_SVMREQUEST, 0, payload, remote_rank);
+
+	request[remote_rank]++;
+	NOP8;
+	icc_send_irq(remote_rank);
+
+	/* check for incoming messages */
+	icc_mail_check();
+
+	while (page_owner[pageid] != my_ue) {
+		NOP4;
+	}
+
+	return change_page_permissions(addr, addr+PAGE_SIZE, VMA_READ|VMA_WRITE|VMA_CACHEABLE);
+}
+void* svmmalloc(size_t size, uint32_t consistency)
+{
+	size_t phyaddr, viraddr, i;
+	uint32_t flags;
+	uint32_t map_flags = MAP_KERNEL_SPACE|MAP_MPE;
+
+	if (consistency & SVM_STRONG)
+		map_flags |= MAP_SVM_STRONG;
+	else if (consistency & SVM_LAZYRELEASE)
+		map_flags |= MAP_SVM_LAZYRELEASE;
+	else
+		return NULL;
+
+	// currently, we allocate memory at page size granularity
+	size = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+
+	// iRCCE is not thread-safe => disable interrupts
+	flags = irq_nested_disable();
+	phyaddr = (size_t) RCCE_shmalloc(size);
+	if (RCCE_ue() && (consistency & SVM_STRONG))
+		map_flags |= MAP_NO_ACCESS;
+	irq_nested_enable(flags);
+
+	if (BUILTIN_EXPECT(!phyaddr, 0))
+		return NULL;
+	if (BUILTIN_EXPECT(phyaddr & 0xFFF, 0)) {
+		kprintf("RCCE_shmalloc did not return a page-aligned physical address: 0x%x\n", phyaddr);
+		return NULL;
+	}
+
+	viraddr = map_region(0, phyaddr, size >> PAGE_SHIFT, map_flags);
+	for(i=0; i<size; i+=PAGE_SIZE)
+		phys2virt[(phyaddr+i-shmbegin) >> PAGE_SHIFT] = viraddr + i;
+
+	kprintf("svmmalloc: phyaddr 0x%x, viraddr 0x%x, size 0x%x\n", phyaddr, viraddr, size);
+
+	return (void*) viraddr;
+}
+
+void svmfree(void* addr, size_t size)
+{
+	size_t phyaddr, i;
+	uint32_t flags;
+
+	if (BUILTIN_EXPECT(!addr || !size, 0))
+		return;
+
+	phyaddr = virt_to_phys((size_t) addr);
+
+	// currently, we allocate memory at page size granularity
+	size = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+
+	kprintf("svmfree: phyaddr 0x%x, viraddr 0x%x, size 0x%x\n", phyaddr, addr, size);
+
+	unmap_region((size_t) addr, size >> PAGE_SHIFT);
+	for(i=0; i<size; i+=PAGE_SIZE)
+		phys2virt[(phyaddr+i-shmbegin) >> PAGE_SHIFT] = 0;
+
+	// iRCCE is not thread-safe => disable interrupts
+	flags = irq_nested_disable();
+	RCCE_shfree((t_vcharp) phyaddr);
+	irq_nested_enable(flags);
+}
+
+/*
+ * This function is called by icc_mail_check.
+ * => The interrupt flag is already cleared.
+ */
+int svm_emit_page(size_t phyaddr, int ue)
+{
+	uint32_t pageid;
+
+	//kprintf("Try to emit page 0x%x to %d\n", phyaddr, ue);
+
+	if (phyaddr < shmbegin)
+		return -EINVAL;
+	if (phyaddr >= shmbegin + RCCE_SHM_SIZE_MAX)
+		return -EINVAL;
+	pageid = (phyaddr-shmbegin) >> PAGE_SHIFT;
+
+	if (page_owner[pageid] != my_ue) {
+		// this core is not the owner => forward the request to the current owner
+		int remote_rank;
+		uint8_t payload[iRCCE_MAIL_HEADER_PAYLOAD];
+
+		kprintf("Oops, core %d is not the owner of page 0x%x\n", my_ue, phyaddr);
+
+		remote_rank = page_owner[pageid];
+		((size_t*) payload)[0] = ue;
+		((size_t*) payload)[1] = phyaddr;
+
+		/* forward the ownership request */
+		iRCCE_mail_send(2*sizeof(size_t), ICC_TAG_SVMREQUEST, 0, payload, remote_rank);
+
+		NOP8;
+		icc_send_irq(remote_rank);
+
+		forward[remote_rank]++;
+	} else {
+		size_t viraddr;
+
+		svm_flush();
+		page_owner[pageid] = ue;
+
+		emit[ue]++;
+		viraddr = phys2virt[(phyaddr - shmbegin) >> PAGE_SHIFT];
+		change_page_permissions(viraddr, viraddr+PAGE_SIZE, VMA_NOACCESS|VMA_READ|VMA_CACHEABLE);
+	}
+
+	return 0;
+}
+
+#ifdef SVM_WB
+void svm_flush(void)
+{
+	int z, tmp;
+
+	// need to write to another line to make sure the write combine buffer gets flushed
+	*(int *)RCCE_fool_write_combine_buffer = 1;
+	flush_cache();
+
+#if 0
+	// try to flush the L2 cache
+	z = Z_PID(RC_COREID[my_ue]);
+	tmp=ReadConfigReg(CRB_OWN + (z==0 ? GLCFG0 : GLCFG1));
+	tmp &= ~(1 << GLCFG_XFLSHNN_BIT);
+	SetConfigReg(CRB_OWN + (z==0 ? GLCFG0 : GLCFG1), tmp);
+
+	while(!(ReadConfigReg(CRB_OWN + (z==0 ? GLCFG0 : GLCFG1)) & (1 << GLCFG_XFLSHNN_BIT))) {
+		NOP8;
+	}
+#endif
+}
+#endif
+
+int svm_statistics(void)
+{
+	uint32_t i;
+
+	kprintf("emit\t:");
+	for(i=0; i<RCCE_MAXNP; i++)
+		kprintf("\t%u", emit[i]);
+	kprintf("\nrequest\t:");
+	for(i=0; i<RCCE_MAXNP; i++)
+		kprintf("\t%u", request[i]);
+	kprintf("\nforward\t:");
+	for(i=0; i<RCCE_MAXNP; i++)
+		kprintf("\t%u", forward[i]);
+	kprintf("\n");
+
+	return 0;
+}
+#endif
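/*
 * Example (an illustrative sketch, not taken from the patch): the wire format
 * of an ICC_TAG_SVMREQUEST mail, as built in svm_access_request() and consumed
 * in svm_emit_page() above. The payload carries two size_t words: the
 * requesting core and the physical address of the wanted page. The struct and
 * function names are made up for illustration; PAYLOAD_SIZE stands in for
 * iRCCE_MAIL_HEADER_PAYLOAD.
 */
#include <stddef.h>
#include <stdint.h>

#define PAYLOAD_SIZE 32

struct svm_request {
	size_t requester;	/* ((size_t*) payload)[0]: rank that wants the page */
	size_t phyaddr;		/* ((size_t*) payload)[1]: physical page address */
};

static void pack_request(uint8_t payload[PAYLOAD_SIZE], int my_ue, size_t phyaddr)
{
	((size_t*) payload)[0] = (size_t) my_ue;
	((size_t*) payload)[1] = phyaddr;
}

static struct svm_request unpack_request(const uint8_t payload[PAYLOAD_SIZE])
{
	struct svm_request req;

	req.requester = ((const size_t*) payload)[0];
	req.phyaddr = ((const size_t*) payload)[1];

	return req;
}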
diff --git a/arch/x86/scc/iRCCE_get.c b/arch/x86/scc/iRCCE_get.c
--- a/arch/x86/scc/iRCCE_get.c
+++ b/arch/x86/scc/iRCCE_get.c
 
 #include
 
-#ifdef COPPERRIDGE
+#if defined(COPPERRIDGE) || defined(SCC)
 #include "scc_memcpy.h"
 #endif
 
 void* iRCCE_memcpy_get(void *dest, const void *src, size_t count)
 {
-#ifdef COPPERRIDGE
-	return memcpy_from_mpb(dest, src, count);
+#if defined(COPPERRIDGE) || defined(SCC)
+	return memcpy_get(dest, src, count);
 #else
 	return memcpy(dest, src, count);
 #endif
diff --git a/arch/x86/scc/iRCCE_put.c b/arch/x86/scc/iRCCE_put.c
index 93cea070..96060cae 100644
--- a/arch/x86/scc/iRCCE_put.c
+++ b/arch/x86/scc/iRCCE_put.c
@@ -35,14 +35,14 @@
 
 #include
 
-#ifdef COPPERRIDGE
+#if defined(COPPERRIDGE) || defined(SCC)
 #include "scc_memcpy.h"
 #endif
 
 void* iRCCE_memcpy_put(void *dest, const void *src, size_t count)
 {
-#ifdef COPPERRIDGE
-	return memcpy_to_mpb(dest, src, count);
+#if defined(COPPERRIDGE) || defined(SCC)
+	return memcpy_put(dest, src, count);
 #else
 	return memcpy(dest, src, count);
 #endif
diff --git a/arch/x86/scc/icc.c b/arch/x86/scc/icc.c
index 75bfdc78..1d3b4df7 100644
--- a/arch/x86/scc/icc.c
+++ b/arch/x86/scc/icc.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <asm/svm.h>
 
 #include
 
@@ -131,7 +132,7 @@ int icc_init(void)
 		return -ENODEV;
 
 	// enable additional outputs
-	RCCE_debug_set(RCCE_DEBUG_ALL);
+	//RCCE_debug_set(RCCE_DEBUG_ALL);
 
 	my_ue = RCCE_ue();
 	num_ues = RCCE_num_ues();
@@ -163,9 +164,18 @@ int icc_init(void)
 	// reset INTR/LINT0 flag
 	z = Z_PID(RC_COREID[my_ue]);
 	tmp=ReadConfigReg(CRB_OWN + (z==0 ? GLCFG0 : GLCFG1));
-	tmp &= ~2;
+	tmp &= ~(1 << GLCFG_XINTR_BIT);
 	SetConfigReg(CRB_OWN + (z==0 ? GLCFG0 : GLCFG1), tmp);
 
+#if 0
+	// disable L2 cache
+	z = Z_PID(RC_COREID[my_ue]);
+	tmp=ReadConfigReg(CRB_OWN + (z==0 ? L2CFG0 : L2CFG1));
+	tmp |= (1 << L2CFG_WAYDISABLE_BIT);
+	SetConfigReg(CRB_OWN + (z==0 ? L2CFG0 : L2CFG1), tmp);
+	kprintf("set L2CFG to 0x%x\n", (uint32_t) tmp);
+#endif
+
 	// set interrupt handler (INTR/LINT0)
 	irq_install_handler(124, intr_handler);
 
@@ -262,12 +272,13 @@ int icc_mail_ping( void )
 void icc_mail_check(void)
 {
 	iRCCE_MAIL_HEADER* header = NULL;
-	int res;
 	uint64_t timer;
 	//char* recv_buffer;
 
 	// empty mailbox and interpret headers
-	while( (res = iRCCE_mail_recv( &header )) == iRCCE_SUCCESS ) {
+	while( iRCCE_mail_recv( &header ) == iRCCE_SUCCESS ) {
+		//iRCCE_mailbox_print_header(header);
+
 		switch(header->tag)
 		{
 		case ICC_TAG_PINGREQUEST:
@@ -279,6 +290,9 @@ void icc_mail_check(void)
 			timer = rdtsc() - *((uint64_t*) header->payload);
 			kprintf( "Response received in %d ticks!\n", timer );
 			break;
+		case ICC_TAG_SVMREQUEST:
+			svm_emit_page(((size_t*) header->payload)[1], ((size_t*) header->payload)[0]);
+			break;
 		default:
 			kprintf("Invalid mail: tag = %d\n", header->tag);
 			break;
diff --git a/arch/x86/scc/scc_memcpy.h b/arch/x86/scc/scc_memcpy.h
index 31eb47c1..b4d9a0ba 100644
--- a/arch/x86/scc/scc_memcpy.h
+++ b/arch/x86/scc/scc_memcpy.h
@@ -17,25 +17,10 @@
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
 
-/**
- * @author Stefan Lankey, Carsten Clauss
- * @file arch/x86/scc/scc_memcpy.h
- * @brief Special memcpy related implementations for the Intel SCC
- *
- * This file contains special SCC-efficient memcpy implementations
- * to get memory from the RAM into the on-die memory or from the
- * on-die memory into the RAM.
- */
-
 #ifndef __SCC_MEMCPY_H_
 #define __SCC_MEMPCY_H_
 
-#include
-
-#ifdef CONFIG_ROCKCREEK
-
-/** @brief Fast procedure to get a byte range from RAM into on-die memory.
- * +/* * A write access, which cache line is not present, doesn't perform (on the * current SCC architecture) a cache line fill. Therefore, the core writes * in this case directly to the memory. @@ -43,14 +28,10 @@ * The following function copies from the on-die memory (MPB) to the off-die * memory and prefetchs its destintation. Therefore, the function avoids the * bad behavior of a "write miss". - * - * @param dest Destination address - * @param src Source address - * @param count Range size in bytes */ inline static void *memcpy_get(void *dest, const void *src, size_t count) { - int32_t h, i, j, k, l, m; + int h, i, j, k, l, m; asm volatile ("cld;\n\t" "1: cmpl $0, %%eax ; je 2f\n\t" @@ -85,19 +66,36 @@ inline static void *memcpy_get(void *dest, const void *src, size_t count) return dest; } +#if 1 +/* + * In our kernel, we didn't want to use FPU registers. + * Therefore, we use standard memcpy routine + */ +inline static void *memcpy_put(void* dest, const void *src, size_t count) +{ + int32_t i, j, k; -/** @brief Fast procedure to get a byte range from on-die memory into RAM. - * + if (BUILTIN_EXPECT(!dest || !src, 0)) + return dest; + + asm volatile ( + "cld; rep movsl\n\t" + "movl %4, %%ecx\n\t" + "andl $3, %%ecx\n\t" + "rep movsb\n\t" + : "=&c"(i), "=&D"(j), "=&S"(k) + : "0"(count/4), "g"(count), "1"(dest), "2"(src) : "memory"); + + return dest; +} +#else +/* * If the destination is located on on-die memory (MPB), classical prefetching * techniques will be used to increase the performance. - * - * @param dest Destination address - * @param src Source address - * @param count range size in bytes */ inline static void *memcpy_put(void *dest, const void *src, size_t count) { - int32_t i, j, k, l; + int i, j, k, l; /* * We use the floating point registers to @@ -166,7 +164,6 @@ inline static void *memcpy_put(void *dest, const void *src, size_t count) return dest; } - #endif #endif diff --git a/drivers/net/mmnif.c b/drivers/net/mmnif.c index 6d805772..1cb8b779 100644 --- a/drivers/net/mmnif.c +++ b/drivers/net/mmnif.c @@ -47,6 +47,7 @@ extern HANDLE hProc; #include #include +#include #include #include @@ -715,7 +716,11 @@ err_t mmnif_init(struct netif* netif) /* Alloc and clear shared memory for rx_buff */ mpb_size = (sizeof(mm_rx_buffer_t) + MMNIF_RX_BUFFERLEN); + // align mpb size to the granularity of a page size + mpb_size = (mpb_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); mpb_start_address = RCCE_shmalloc(mpb_size*MMNIF_CORES); + // map physical address in the virtual address space + mpb_start_address = map_region(0, mpb_start_address, mpb_size >> PAGE_SHIFT, MAP_KERNEL_SPACE|MAP_NO_CACHE); mmnif->rx_buff = mpb_start_address + (mpb_size) * (own_ip_address - router_ip_address); if (!(mpb_start_address)) @@ -1103,6 +1108,8 @@ int mmnif_open(void) */ int mmnif_close(void) { + size_t phyaddr; + mmnif_t* mmnif; if (!mmnif_dev) @@ -1119,7 +1126,12 @@ int mmnif_close(void) kfree(mmnif->tx_buff[0],MMNIF_TX_QUEUELEN * MMNIF_TX_BUFFERLEN); kfree(mmnif_dev,sizeof(mmnif_t)); - RCCE_shfree(mpb_start_address); + + // determine physical address + phyaddr = virt_to_phys(mpb_start_address); + // unmap shared memory regeion + unmap_region(mpb_start_address, mpb_size >> PAGE_SHIFT); + RCCE_shfree(phyaddr); return NULL; } diff --git a/drivers/net/rckemac.c b/drivers/net/rckemac.c index 1c8ae09d..2769b909 100644 --- a/drivers/net/rckemac.c +++ b/drivers/net/rckemac.c @@ -261,6 +261,7 @@ again: static void rckemacif_input(struct netif* netif, struct pbuf* p) { struct eth_hdr *ethhdr; + err_t err; /* points 
to packet payload, which starts with an Ethernet header */ ethhdr = p->payload; @@ -275,8 +276,8 @@ static void rckemacif_input(struct netif* netif, struct pbuf* p) case ETHTYPE_PPPOE: #endif /* PPPOE_SUPPORT */ /* full packet send to tcpip_thread to process */ - if (mynetif->input(p, mynetif) != ERR_OK) { - LWIP_DEBUGF(NETIF_DEBUG, ("rckemacif_input: IP input error\n")); + if ((err = mynetif->input(p, mynetif)) != ERR_OK) { + LWIP_DEBUGF(NETIF_DEBUG, ("rckemacif_input: IP input error %u\n", err)); pbuf_free(p); } break; diff --git a/drivers/stdin/stdin.c b/drivers/stdin/stdin.c index 58005b09..27119b6e 100644 --- a/drivers/stdin/stdin.c +++ b/drivers/stdin/stdin.c @@ -35,7 +35,8 @@ static ssize_t stdin_read(fildes_t* file, uint8_t* buffer, size_t size) kb_buffer.maxsize = size; kb_buffer.size = 0; kb_buffer.tid = per_core(current_task)->id; - per_core(current_task)->status = TASK_BLOCKED; + block_current_task(); + //per_core(current_task)->status = TASK_BLOCKED; reschedule(); size = kb_buffer.size; diff --git a/include/metalsvm/config.h.example b/include/metalsvm/config.h.example index bd4c0b73..90f8b7c8 100644 --- a/include/metalsvm/config.h.example +++ b/include/metalsvm/config.h.example @@ -60,6 +60,7 @@ extern "C" { // RCCE specific flags #define SCC +#define COPPERRIDGE #define MS_BAREMETAL //#define GORY #define SHMADD diff --git a/include/metalsvm/fs.h b/include/metalsvm/fs.h index 855dc830..cdec219a 100644 --- a/include/metalsvm/fs.h +++ b/include/metalsvm/fs.h @@ -109,11 +109,11 @@ typedef struct block_list { } block_list_t; typedef struct vfs_node { - /// The permissions mask. + /// The permissions mask. uint32_t mask; - /// The owning user. + /// The owning user. uint32_t uid; - /// The owning group. + /// The owning group. uint32_t gid; /// Includes the node type. See #defines above. uint32_t type; diff --git a/include/metalsvm/semaphore.h b/include/metalsvm/semaphore.h index 19ad8e38..c0dc0f81 100644 --- a/include/metalsvm/semaphore.h +++ b/include/metalsvm/semaphore.h @@ -124,7 +124,7 @@ next_try1: } else { s->queue[s->pos] = curr_task->id; s->pos = (s->pos + 1) % MAX_TASKS; - curr_task->status = TASK_BLOCKED; + block_current_task(); spinlock_irqsave_unlock(&s->lock); reschedule(); NOP2; @@ -152,11 +152,10 @@ next_try2: } s->queue[s->pos] = curr_task->id; s->pos = (s->pos + 1) % MAX_TASKS; - curr_task->timeout = deadline; - curr_task->flags |= TASK_TIMER_USED; - curr_task->status = TASK_BLOCKED; + set_timer(deadline); spinlock_irqsave_unlock(&s->lock); reschedule(); + NOP2; goto next_try2; } } diff --git a/include/metalsvm/stdlib.h b/include/metalsvm/stdlib.h index ad1cecdb..15d0961e 100644 --- a/include/metalsvm/stdlib.h +++ b/include/metalsvm/stdlib.h @@ -48,6 +48,10 @@ extern "C" { #ifdef CONFIG_ROCKCREEK #define MAP_MPE (1 << 8) #endif +#define MAP_SVM_STRONG (1 << 9) +#define MAP_SVM_LAZYRELEASE (1 << 10) +#define MAP_NO_ACCESS (1 << 11) + void NORETURN abort(void); /** @brief Kernel's memory allocator function. diff --git a/include/metalsvm/tasks.h b/include/metalsvm/tasks.h index 0b0bcdac..be3fdd07 100644 --- a/include/metalsvm/tasks.h +++ b/include/metalsvm/tasks.h @@ -65,7 +65,7 @@ int multitasking_init(void); * - 0 on success * - -EINVAL (-22) on failure */ -int create_kernel_task(tid_t* id, entry_point_t ep, void* arg); +int create_kernel_task(tid_t* id, entry_point_t ep, void* arg, uint8_t prio); /** @brief Create a user level task. 
* @@ -85,6 +85,22 @@ int create_user_task(tid_t* id, const char* fame, char** argv); */ tid_t wait(int32_t* result); +/** @brief Update the load of the current core + * + * This function is called from the timer interrupt + * and updates the load of the current core + */ +void update_load(void); + +#if MAX_CORES > 1 +/** @brief Load balancer + * + * This load balancer is called from the timer interrupt + * and steals tasks from other cores + */ +void load_balancing(void); +#endif + /** @brief Task switcher * * Timer-interrupted use of this function for task switching */ @@ -100,6 +116,25 @@ void scheduler(void); */ int wakeup_task(tid_t); +/** @brief Block current task + * + * The current task's status will be changed to TASK_BLOCKED + * + * @return + * - 0 on success + * - -EINVAL (-22) on failure + */ +int block_current_task(void); + +/** @brief Block current task until timer expires + * + * @param deadline Clock tick, when the timer expires + * @return + * - 0 on success + * - -EINVAL (-22) on failure + */ +int set_timer(uint64_t deadline); + /** @brief Abort current task */ void NORETURN abort(void); diff --git a/include/metalsvm/tasks_types.h b/include/metalsvm/tasks_types.h index 0806ab50..7bc03f11 100644 --- a/include/metalsvm/tasks_types.h +++ b/include/metalsvm/tasks_types.h @@ -41,6 +41,13 @@ extern "C" { #endif +#define MAX_PRIO 31 +#define REALTIME_PRIO 31 +#define HIGH_PRIO 16 +#define NORMAL_PRIO 8 +#define LOW_PRIO 1 +#define IDLE_PRIO 0 + #define TASK_INVALID 0 #define TASK_READY 1 #define TASK_RUNNING 2 @@ -51,8 +58,6 @@ extern "C" { #define TASK_DEFAULT_FLAGS 0 #define TASK_FPU_INIT (1 << 0) #define TASK_FPU_USED (1 << 1) -#define TASK_TIMER_USED (1 << 2) -#define TASK_SWITCH_IN_PROGRESS (1 << 3) typedef int (*entry_point_t)(void*); typedef int (STDCALL *internal_entry_point_t)(void*); @@ -65,11 +70,17 @@ typedef struct task { /// Task status (INVALID, READY, RUNNING, ...) uint32_t status; /// Additional status flags. 
For instance, to signal the use of the FPU
-	uint32_t flags;
-	/// Number of used time slices
-	uint32_t time_slices;
+	uint8_t flags;
+	/// Task priority
+	uint8_t prio;
 	/// timeout for a blocked task
 	uint64_t timeout;
+	/// next task in the queue
+	struct task* next;
+	/// previous task in the queue
+	struct task* prev;
+	/// last core id on which the task was running
+	uint32_t last_core;
 	/// Usage in number of pages
 	atomic_int32_t user_usage;
 	/// Avoids concurrent access to the page directory
@@ -85,13 +96,11 @@ typedef struct task {
 	/// starting time/tick of the task
 	uint64_t start_tick;
 	/// Start address of the heap
-	uint32_t start_heap;
+	size_t start_heap;
 	/// End address of the heap
-	uint32_t end_heap;
-#ifdef CONFIG_LWIP
+	size_t end_heap;
 	/// LwIP error code
 	int lwip_err;
-#endif
 	/// Mail inbox
 	mailbox_wait_msg_t inbox;
 	/// Mail outbox array
@@ -100,6 +109,34 @@
 	union fpu_state fpu;
 } task_t;
 
+typedef struct {
+	task_t* first;
+	task_t* last;
+} task_list_t;
+
+typedef struct {
+	/// idle task
+	task_t* idle __attribute__ ((aligned (CACHE_LINE)));
+	/// previous task
+	task_t* old_task;
+	/// total number of tasks in the queue
+	uint32_t nr_tasks;
+	/// current load = average number of tasks in the queue (1-minute average)
+	uint32_t load;
+	/// helper counter to determine the cpu load
+	int32_t load_counter;
+	/// helper counter to avoid "over balancing"
+	int32_t balance_counter;
+	/// indicates the used priority queues
+	uint32_t prio_bitmap;
+	/// a queue for each priority
+	task_list_t queue[MAX_PRIO];
+	/// a queue for timers
+	task_list_t timers;
+	/// lock for this runqueue
+	spinlock_t lock;
+} runqueue_t;
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/metalsvm/vma.h b/include/metalsvm/vma.h
index 449e81da..74c63233 100644
--- a/include/metalsvm/vma.h
+++ b/include/metalsvm/vma.h
@@ -32,10 +32,11 @@
 extern "C" {
 #endif
 
-#define VMA_READ	0x01
-#define VMA_WRITE	0x02
-#define VMA_EXECUTE	0x04
-#define VMA_CACHEABLE	0x08
+#define VMA_READ	(1 << 0)
+#define VMA_WRITE	(1 << 1)
+#define VMA_EXECUTE	(1 << 2)
+#define VMA_CACHEABLE	(1 << 3)
+#define VMA_NOACCESS	(1 << 4)
 
 struct vma;
 
diff --git a/kernel/client.c b/kernel/client.c
index 386e69a7..0c3982fb 100644
--- a/kernel/client.c
+++ b/kernel/client.c
@@ -42,7 +42,7 @@ int cli_ConnectTo(Client* cli,char * pAdresse,unsigned short Port,int webAdresse
 	if (connect(cli->sSocket,(const struct sockaddr*)&cli->adAddr, sizeof(cli->adAddr))==0)
 	{
-		create_kernel_task(&cli->bThread,cli_WaitForPacket,cli);
+		create_kernel_task(&cli->bThread,cli_WaitForPacket,cli, NORMAL_PRIO);
 
 		if (cli->_OnConnect != 0)
 		{
diff --git a/kernel/main.c b/kernel/main.c
index f89d2bc7..6a5b9e79 100644
--- a/kernel/main.c
+++ b/kernel/main.c
@@ -32,6 +32,7 @@
 #include
 #ifdef CONFIG_ROCKCREEK
 #include
+#include <asm/svm.h>
 #endif
 
 /*
@@ -75,6 +76,7 @@ int main(void)
 	mmu_init();
 #ifdef CONFIG_ROCKCREEK
 	icc_init();
+	svm_init();
 #endif
 
 	initrd_init();
@@ -89,8 +91,7 @@ int main(void)
 	kprintf("Current available memory: %u MBytes\n", atomic_int32_read(&total_available_pages)/((1024*1024)/PAGE_SIZE));
 
 	sleep(5);
-	create_kernel_task(NULL, initd, NULL);
-	per_core(current_task)->time_slices = 0; // reset the number of time slices
+	create_kernel_task(NULL, initd, NULL, NORMAL_PRIO);
 	reschedule();
 
 	while(1) {
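/*
 * Example (an illustrative sketch, not taken from the patch): the extended
 * create_kernel_task() signature in use. The priority argument selects the
 * run queue; values above MAX_PRIO fall back to NORMAL_PRIO, and IDLE_PRIO is
 * reserved for the idle tasks. The header paths and the demo function are
 * assumptions made for illustration.
 */
#include <metalsvm/tasks.h>
#include <metalsvm/tasks_types.h>

static int demo(void* arg)
{
	return 0;
}

void spawn_examples(void)
{
	tid_t id;

	create_kernel_task(&id, demo, NULL, NORMAL_PRIO);	/* ordinary background work */
	create_kernel_task(&id, demo, NULL, HIGH_PRIO);		/* preferred over NORMAL_PRIO tasks */
	create_kernel_task(NULL, demo, NULL, 42);		/* > MAX_PRIO: clamped to NORMAL_PRIO */
}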
diff --git a/kernel/server.c b/kernel/server.c
index bd5c441a..1d988eaa 100644
--- a/kernel/server.c
+++ b/kernel/server.c
@@ -78,7 +78,7 @@ void* srv_WaitForConnection(Server* srv)
 	t = (ServerThreadArgs*) kmalloc(sizeof(ServerThreadArgs));
 	t->ID = i;
 	t->srv = srv;
-	create_kernel_task(&srv->bThreads[i],srv_WaitForPacket,t);
+	create_kernel_task(&srv->bThreads[i],srv_WaitForPacket,t, NORMAL_PRIO);
 	break;
 }
@@ -175,7 +175,7 @@ int server_init(Server* srv, unsigned short Port, unsigned int dwMaxConnections)
 	bind( srv->sSocket,(const struct sockaddr *) &srv->adAddr, sizeof(srv->adAddr));	// bind the server to its address
 	listen(srv->sSocket,srv->dwMaximumConnections);	// put the server into the listening state
-	create_kernel_task(&srv->bThread_listen,srv_WaitForConnection,srv);
+	create_kernel_task(&srv->bThread_listen,srv_WaitForConnection,srv, NORMAL_PRIO);
 	// sConnections[0] = accept(sSocket,(struct sockaddr*)&tmpAddr,&tmpAddrLen);
 	// t.ID = 0;
 	// bthread_create(&bThreads[0],NULL,(start_routine) srv_WaitForPacket,&t);
diff --git a/kernel/tasks.c b/kernel/tasks.c
index 1c07b40d..d3512e96 100644
--- a/kernel/tasks.c
+++ b/kernel/tasks.c
@@ -47,14 +47,20 @@
 * A task's id will be its position in this array.
 */
 static task_t task_table[MAX_TASKS] = { \
-	[0] = {0, TASK_IDLE, 0, 0, 0, ATOMIC_INIT(0), SPINLOCK_INIT, NULL, SPINLOCK_INIT, NULL, FS_INIT, 0, 0, 0, 0}, \
-	[1 ... MAX_TASKS-1] = {0, TASK_INVALID, 0, 0, 0, ATOMIC_INIT(0), SPINLOCK_INIT, NULL, SPINLOCK_INIT, NULL, FS_INIT, 0, 0, 0, 0}};
+	[0] = {0, TASK_IDLE, 0, 0, 0, NULL, NULL, 0, ATOMIC_INIT(0), SPINLOCK_INIT, NULL, SPINLOCK_INIT, NULL, FS_INIT, 0, 0, 0, 0}, \
+	[1 ... MAX_TASKS-1] = {0, TASK_INVALID, 0, 0, 0, NULL, NULL, 0, ATOMIC_INIT(0), SPINLOCK_INIT, NULL, SPINLOCK_INIT, NULL, FS_INIT, 0, 0, 0, 0}};
+
 static spinlock_irqsave_t table_lock = SPINLOCK_IRQSAVE_INIT;
+
+#if MAX_CORES > 1
+static runqueue_t runqueues[MAX_CORES] = { \
+	[0] = {task_table+0, NULL, 0, 0, 0, 0, 0, {[0 ... 
MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_INIT}}; +#endif DEFINE_PER_CORE(task_t*, current_task, task_table+0); -#if MAX_CORES > 1 -DEFINE_PER_CORE_STATIC(task_t*, old_task, NULL); -#endif /** @brief helper function for the assembly code to determine the current task * @return Pointer to the task_t structure of current task @@ -63,24 +69,6 @@ task_t* get_current_task(void) { return per_core(current_task); } -int dump_scheduling_statistics(void) -{ - uint32_t i; - uint32_t id = 0; - - kprintf("Scheduling statistics:\n"); - kprintf("======================\n"); - kprintf("total ticks:\t%llu\n", get_clock_tick()); - for(i=0; iprio; + if (!runqueues[core_id].queue[prio-1].first) { + old->prev = NULL; + runqueues[core_id].queue[prio-1].first = runqueues[core_id].queue[prio-1].last = old; + } else { + old->prev = runqueues[core_id].queue[prio-1].last; + runqueues[core_id].queue[prio-1].last->next = old; + runqueues[core_id].queue[prio-1].last = old; + } + runqueues[core_id].old_task = NULL; + runqueues[core_id].prio_bitmap |= (1 << prio); + old->next = NULL; + } + spinlock_unlock(&runqueues[core_id].lock); + + irq_enable(); +} + /** @brief Wakeup tasks which are waiting for a message from the current one * * @param result Current task's resulting return value @@ -145,6 +162,7 @@ static void wakeup_blocked_tasks(int result) static void NORETURN do_exit(int arg) { vma_t* tmp; task_t* curr_task = per_core(current_task); + uint32_t flags, core_id; kprintf("Terminate task: %u, return value %d\n", curr_task->id, arg); @@ -168,6 +186,15 @@ static void NORETURN do_exit(int arg) { kprintf("Memory leak! Task %d did not release %d pages\n", curr_task->id, atomic_int32_read(&curr_task->user_usage)); curr_task->status = TASK_FINISHED; + + // decrease the number of active tasks + flags = irq_nested_disable(); + core_id = CORE_ID; + spinlock_lock(&runqueues[core_id].lock); + runqueues[core_id].nr_tasks--; + spinlock_unlock(&runqueues[core_id].lock); + irq_nested_enable(flags); + reschedule(); kprintf("Kernel panic: scheduler on core %d found no valid task\n", CORE_ID); @@ -203,17 +230,22 @@ void NORETURN abort(void) { * - 0 on success * - -ENOMEM (-12) or -EINVAL (-22) on failure */ -static int create_task(tid_t* id, internal_entry_point_t ep, void* arg) +static int create_task(tid_t* id, internal_entry_point_t ep, void* arg, uint8_t prio) { task_t* curr_task; int ret = -ENOMEM; - unsigned int i; + unsigned int i, core_id; if (BUILTIN_EXPECT(!ep, 0)) return -EINVAL; + if (BUILTIN_EXPECT(prio == IDLE_PRIO, 0)) + return -EINVAL; + if (BUILTIN_EXPECT(prio > MAX_PRIO, 0)) + return -EINVAL; spinlock_irqsave_lock(&table_lock); + core_id = CORE_ID; curr_task = per_core(current_task); for(i=0; inext = task_table+i; + runqueues[core_id].queue[prio-1].last = task_table+i; + task_table[i].next = NULL; + } + spinlock_unlock(&runqueues[core_id].lock); break; } } @@ -261,7 +311,7 @@ create_task_out: int sys_fork(void) { int ret = -ENOMEM; - unsigned int i, fd_i; + unsigned int i, core_id, fd_i; task_t* parent_task = per_core(current_task); vma_t** child; vma_t* parent; @@ -270,6 +320,8 @@ int sys_fork(void) spinlock_lock(&parent_task->vma_lock); spinlock_irqsave_lock(&table_lock); + core_id = CORE_ID; + for(i=0; iid] = &parent_task->inbox; - task_table[i].flags = parent_task->flags & ~TASK_SWITCH_IN_PROGRESS; + task_table[i].flags = parent_task->flags; memcpy(&(task_table[i].fpu), &(parent_task->fpu), sizeof(union fpu_state)); task_table[i].start_tick = get_clock_tick(); task_table[i].start_heap = 0; 
task_table[i].end_heap = 0; task_table[i].lwip_err = 0; + task_table[i].prio = parent_task->prio; + task_table[i].last_core = parent_task->last_core; + + // add task in the runqueue + spinlock_lock(&runqueues[core_id].lock); + runqueues[core_id].prio_bitmap |= (1 << parent_task->prio); + runqueues[core_id].nr_tasks++; + if (!runqueues[core_id].queue[parent_task->prio-1].first) { + task_table[i].prev = NULL; + runqueues[core_id].queue[parent_task->prio-1].first = task_table+i; + runqueues[core_id].queue[parent_task->prio-1].last = task_table+i; + task_table[i].next = NULL; + } else { + task_table[i].prev = runqueues[core_id].queue[parent_task->prio-1].last; + runqueues[core_id].queue[parent_task->prio-1].last->next = task_table+i; + runqueues[core_id].queue[parent_task->prio-1].last = task_table+i; + task_table[i].next = NULL; + } + spinlock_unlock(&runqueues[core_id].lock); ret = arch_fork(task_table+i); @@ -325,13 +396,7 @@ int sys_fork(void) // Leave the function without releasing the locks // because the locks are already released // by the parent task! -#if MAX_CORES > 1 - task_t* old = per_core(old_task); - - if (old) - old->flags &= ~TASK_SWITCH_IN_PROGRESS; -#endif - irq_enable(); + finish_task_switch(); return 0; } @@ -365,13 +430,8 @@ static int STDCALL kernel_entry(void* args) { int ret; kernel_args_t* kernel_args = (kernel_args_t*) args; -#if MAX_CORES > 1 - task_t* old = per_core(old_task); - if (old) - old->flags &= ~TASK_SWITCH_IN_PROGRESS; -#endif - irq_enable(); + finish_task_switch(); if (BUILTIN_EXPECT(!kernel_args, 0)) return -EINVAL; @@ -383,7 +443,7 @@ static int STDCALL kernel_entry(void* args) return ret; } -int create_kernel_task(tid_t* id, entry_point_t ep, void* args) +int create_kernel_task(tid_t* id, entry_point_t ep, void* args, uint8_t prio) { kernel_args_t* kernel_args; @@ -394,7 +454,10 @@ int create_kernel_task(tid_t* id, entry_point_t ep, void* args) kernel_args->func = ep; kernel_args->args = args; - return create_task(id, kernel_entry, kernel_args); + if (prio > MAX_PRIO) + prio = NORMAL_PRIO; + + return create_task(id, kernel_entry, kernel_args, prio); } #define MAX_ARGS (PAGE_SIZE - 2*sizeof(int) - sizeof(vfs_node_t*)) @@ -631,13 +694,8 @@ invalid: static int STDCALL user_entry(void* arg) { int ret; -#if MAX_CORES > 1 - task_t* old = per_core(old_task); - if (old) - old->flags &= ~TASK_SWITCH_IN_PROGRESS; -#endif - irq_enable(); + finish_task_switch(); if (BUILTIN_EXPECT(!arg, 0)) return -EINVAL; @@ -695,7 +753,7 @@ int create_user_task(tid_t* id, const char* fname, char** argv) while ((*dest++ = *src++) != 0); } - return create_task(id, user_entry, load_args); + return create_task(id, user_entry, load_args, NORMAL_PRIO); } /** @brief Used by the execve-Systemcall */ @@ -806,54 +864,320 @@ tid_t wait(int32_t* result) */ int wakeup_task(tid_t id) { + task_t* task; + uint32_t core_id, prio; + uint32_t flags; int ret = -EINVAL; - spinlock_irqsave_lock(&table_lock); + flags = irq_nested_disable(); + + task = task_table + id; + prio = task->prio; + core_id = task->last_core; if (task_table[id].status == TASK_BLOCKED) { task_table[id].status = TASK_READY; ret = 0; + + spinlock_lock(&runqueues[core_id].lock); + // increase the number of ready tasks + runqueues[core_id].nr_tasks++; + + // add task to the runqueue + if (!runqueues[core_id].queue[prio-1].last) { + runqueues[core_id].queue[prio-1].last = runqueues[core_id].queue[prio-1].first = task; + task->next = task->prev = NULL; + runqueues[core_id].prio_bitmap |= (1 << prio); + } else { + task->prev = 
runqueues[core_id].queue[prio-1].last; + task->next = NULL; + runqueues[core_id].queue[prio-1].last->next = task; + runqueues[core_id].queue[prio-1].last = task; + } + spinlock_unlock(&runqueues[core_id].lock); } - spinlock_irqsave_unlock(&table_lock); + irq_nested_enable(flags); return ret; } -/* - * we use this struct to guarantee that the id - * has its own cache line - */ -typedef struct { - uint32_t id __attribute__ ((aligned (CACHE_LINE))); - uint8_t gap[CACHE_LINE-sizeof(uint32_t)]; -} last_id_t; - -/** @brief _The_ scheduler procedure +/** @brief Block current task * - * Manages scheduling - right now this is just a round robin scheduler. + * The current task's status will be changed to TASK_BLOCKED + * + * @return + * - 0 on success + * - -EINVAL (-22) on failure */ -void scheduler(void) +int block_current_task(void) +{ + task_t* curr_task; + tid_t id; + uint32_t core_id, prio; + uint32_t flags; + int ret = -EINVAL; + + flags = irq_nested_disable(); + + curr_task = per_core(current_task); + id = curr_task->id; + prio = curr_task->prio; + core_id = CORE_ID; + + if (task_table[id].status == TASK_RUNNING) { + task_table[id].status = TASK_BLOCKED; + ret = 0; + + spinlock_lock(&runqueues[core_id].lock); + // reduce the number of ready tasks + runqueues[core_id].nr_tasks--; + + // remove task from queue + if (task_table[id].prev) + task_table[id].prev->next = task_table[id].next; + if (task_table[id].next) + task_table[id].next->prev = task_table[id].prev; + if (runqueues[core_id].queue[prio-1].first == task_table+id) + runqueues[core_id].queue[prio-1].first = task_table[id].next; + if (runqueues[core_id].queue[prio-1].last == task_table+id) { + runqueues[core_id].queue[prio-1].last = task_table[id].prev; + if (!runqueues[core_id].queue[prio-1].last) + runqueues[core_id].queue[prio-1].last = runqueues[core_id].queue[prio-1].first; + } + + // No valid task in queue => update prio_bitmap + if (!runqueues[core_id].queue[prio-1].first) + runqueues[core_id].prio_bitmap &= ~(1 << prio); + + spinlock_unlock(&runqueues[core_id].lock); + } + + irq_nested_enable(flags); + + return ret; +} + +int set_timer(uint64_t deadline) +{ + task_t* curr_task; + task_t* tmp; + uint32_t core_id, prio; + uint32_t flags; + int ret = -EINVAL; + + flags = irq_nested_disable(); + + curr_task = per_core(current_task); + prio = curr_task->prio; + core_id = CORE_ID; + + if (curr_task->status == TASK_RUNNING) { + curr_task->status = TASK_BLOCKED; + curr_task->timeout = deadline; + ret = 0; + + spinlock_lock(&runqueues[core_id].lock); + + // reduce the number of ready tasks + runqueues[core_id].nr_tasks--; + + // remove task from queue + if (curr_task->prev) + curr_task->prev->next = curr_task->next; + if (curr_task->next) + curr_task->next->prev = curr_task->prev; + if (runqueues[core_id].queue[prio-1].first == curr_task) + runqueues[core_id].queue[prio-1].first = curr_task->next; + if (runqueues[core_id].queue[prio-1].last == curr_task) { + runqueues[core_id].queue[prio-1].last = curr_task->prev; + if (!runqueues[core_id].queue[prio-1].last) + runqueues[core_id].queue[prio-1].last = runqueues[core_id].queue[prio-1].first; + } + + // No valid task in queue => update prio_bitmap + if (!runqueues[core_id].queue[prio-1].first) + runqueues[core_id].prio_bitmap &= ~(1 << prio); + + // add task to the timer queue + tmp = runqueues[core_id].timers.first; + if (!tmp) { + runqueues[core_id].timers.first = runqueues[core_id].timers.last = curr_task; + curr_task->prev = curr_task->next = NULL; + } else { + while(tmp && (deadline 
>= tmp->timeout)) + tmp = tmp->next; + + if (!tmp) { + curr_task->next = NULL; + curr_task->prev = runqueues[core_id].timers.last; + if (runqueues[core_id].timers.last) + runqueues[core_id].timers.last->next = curr_task; + runqueues[core_id].timers.last = curr_task; + if (!runqueues[core_id].timers.first) + runqueues[core_id].timers.first = curr_task; + } else { + curr_task->prev = tmp->prev; + curr_task->next = tmp; + tmp->prev = curr_task; + if (curr_task->prev) + curr_task->prev->next = curr_task; + if (runqueues[core_id].timers.first == tmp) + runqueues[core_id].timers.first = curr_task; + } + } + + spinlock_unlock(&runqueues[core_id].lock); + } else kprintf("Task is already blocked. No timer will be set!\n"); + + irq_nested_enable(flags); + + return ret; +} + +#define FSHIFT 21 /* nr of bits of precision (e.g. 11) */ +#define FIXED_1 (1< 0) + runqueues[core_id].balance_counter--; + if (runqueues[core_id].load_counter < 0) { + runqueues[core_id].load_counter += 5*TIMER_FREQ; + + spinlock_lock(&runqueues[core_id].lock); + runqueues[core_id].load *= EXP; + runqueues[core_id].load += runqueues[core_id].nr_tasks*(FIXED_1-EXP); + runqueues[core_id].load >>= FSHIFT; + spinlock_unlock(&runqueues[core_id].lock); + + //kprintf("load of core %u: %u, %u\n", core_id, runqueues[core_id].load, runqueues[core_id].nr_tasks); + } +} + +#if MAX_CORES > 1 +extern atomic_int32_t cpu_online; + +void load_balancing(void) +{ +#if 0 + uint32_t i, core_id = CORE_ID; + uint32_t prio; + task_t* task; + + spinlock_lock(&runqueues[core_id].lock); + for(i=0; (i runqueues[core_id].load) { + kprintf("Try to steal a task from core %u (load %u) to %u (load %u)\n", i, runqueues[i].load, core_id, runqueues[core_id].load); + kprintf("Task on core %u: %u, core %u, %u\n", i, runqueues[i].nr_tasks, core_id, runqueues[i].nr_tasks); + + prio = last_set(runqueues[i].prio_bitmap); + if (prio) { + // steal a ready task + task = runqueues[i].queue[prio-1].last; + kprintf("Try to steal a ready task %d\n", task->id); + + // remove last element from queue i + if (task->prev) + task->prev->next = NULL; + runqueues[i].queue[prio-1].last = task->prev; + if (!runqueues[i].queue[prio-1].last) + runqueues[i].queue[prio-1].first = NULL; + + // add task at the end of queue core_id + if (!runqueues[core_id].queue[prio-1].last) { + runqueues[core_id].queue[prio-1].first = runqueues[core_id].queue[prio-1].last = task; + task->next = task->prev = NULL; + } else { + runqueues[core_id].queue[prio-1].last->next = task; + task->prev = runqueues[core_id].queue[prio-1].last; + runqueues[core_id].queue[prio-1].last = task; + task->next = NULL; + } + + // update task counters + runqueues[core_id].nr_tasks++; + runqueues[i].nr_tasks--; + runqueues[core_id].balance_counter = 5*TIMER_FREQ; + } else { + task_t* tmp; + + // steal a blocked task + task = runqueues[i].timers.first; + if (!task) // Ups, found no valid task to steal + goto no_task_found; + + kprintf("Try to steal blocked task %d\n", task->id); + + // remove first timer from queue i + if (runqueues[i].timers.first == runqueues[i].timers.last) + runqueues[i].timers.first = runqueues[i].timers.last = NULL; + else + runqueues[i].timers.first = runqueues[i].timers.first->next; + + // add timer to queue core_id + tmp = runqueues[core_id].timers.first; + while(tmp && (task->timeout >= tmp->timeout)) + tmp = tmp->next; + + if (!tmp) { + task->next = NULL; + task->prev = runqueues[core_id].timers.last; + if (runqueues[core_id].timers.last) + runqueues[core_id].timers.last->next = task; + 
runqueues[core_id].timers.last = task; + if (!runqueues[core_id].timers.first) + runqueues[core_id].timers.first = task; + } else { + task->prev = tmp->prev; + task->next = tmp; + tmp->prev = task; + if (task->prev) + task->prev->next = task; + if (runqueues[core_id].timers.first == tmp) + runqueues[core_id].timers.first = task; + } + + // => reschedule on the new core + task->last_core = CORE_ID; + + // update task counters + runqueues[core_id].nr_tasks++; + runqueues[i].nr_tasks--; + runqueues[core_id].balance_counter = 5*TIMER_FREQ; + } + } +no_task_found: + spinlock_unlock(&runqueues[i].lock); + } + spinlock_unlock(&runqueues[core_id].lock); +#endif +} +#endif + +void scheduler(void) { task_t* orig_task; task_t* curr_task; - uint32_t i; - uint32_t new_id; + uint32_t core_id = CORE_ID; + uint32_t prio; uint64_t current_tick; - static last_id_t last_id = { 0 }; -#if MAX_CORES > 1 - spinlock_irqsave_lock(&table_lock); -#endif - current_tick = get_clock_tick(); orig_task = curr_task = per_core(current_task); - - /* increase the number of used time slices */ - curr_task->time_slices++; + curr_task->last_core = core_id; /* signalizes that this task could be reused */ if (curr_task->status == TASK_FINISHED) - curr_task->status = TASK_INVALID; + curr_task->status = TASK_INVALID; /* if the task is using the FPU, we need to save the FPU context */ if (curr_task->flags & TASK_FPU_USED) { @@ -861,64 +1185,87 @@ void scheduler(void) curr_task->flags &= ~TASK_FPU_USED; } - for(i=0, new_id=(last_id.id + 1) % MAX_TASKS; - itimeout <= current_tick) { - if (task_table[new_id].flags & TASK_TIMER_USED) { - if (task_table[new_id].status != TASK_BLOCKED) - task_table[new_id].flags &= ~TASK_TIMER_USED; - if ((task_table[new_id].status == TASK_BLOCKED) && (current_tick >= task_table[new_id].timeout)) { - task_table[new_id].flags &= ~TASK_TIMER_USED; - task_table[new_id].status = TASK_READY; - } - } + task_t* task = runqueues[core_id].timers.first; - if ((task_table[new_id].status == TASK_READY) && !(task_table[new_id].flags & TASK_SWITCH_IN_PROGRESS)) { - if (curr_task->status == TASK_RUNNING) { - curr_task->status = TASK_READY; -#if MAX_CORES > 1 - curr_task->flags |= TASK_SWITCH_IN_PROGRESS; - per_core(old_task) = curr_task; -#endif - } -#if MAX_CORES > 1 - else per_core(old_task) = NULL; -#endif - task_table[new_id].status = TASK_RUNNING; - curr_task = per_core(current_task) = task_table+new_id; - last_id.id = new_id; + // remove timer from queue + runqueues[core_id].timers.first = runqueues[core_id].timers.first->next; + if (!runqueues[core_id].timers.first) + runqueues[core_id].timers.last = NULL; - goto get_task_out; + // wakeup task + if (task->status == TASK_BLOCKED) { + task->status = TASK_READY; + prio = task->prio; + + // increase the number of ready tasks + runqueues[core_id].nr_tasks++; + + // add task to the runqueue + if (!runqueues[core_id].queue[prio-1].first) { + runqueues[core_id].queue[prio-1].last = runqueues[core_id].queue[prio-1].first = task; + task->next = task->prev = NULL; + runqueues[core_id].prio_bitmap |= (1 << prio); + } else { + task->prev = runqueues[core_id].queue[prio-1].last; + task->next = NULL; + runqueues[core_id].queue[prio-1].last->next = task; + runqueues[core_id].queue[prio-1].last = task; + } } } + runqueues[core_id].old_task = NULL; // reset old task + prio = last_set(runqueues[core_id].prio_bitmap); // determines highest priority #if MAX_CORES > 1 - per_core(old_task) = NULL; + if (!prio) { + load_balancing(); + prio = last_set(runqueues[core_id].prio_bitmap); // 
retry... + } #endif - if ((curr_task->status == TASK_RUNNING) || (curr_task->status == TASK_IDLE)) - goto get_task_out; + if (BUILTIN_EXPECT(prio > MAX_PRIO, 0)) { + kprintf("Invalid priority %u by bitmap 0x%x\n", prio, runqueues[core_id].prio_bitmap); + prio = 0; + } - /* - * we switch to the idle task, if the current task terminates - * and no other is ready - */ - new_id = CORE_ID; - curr_task = per_core(current_task) = task_table+CORE_ID; + if (!prio) { + if ((curr_task->status == TASK_RUNNING) || (curr_task->status == TASK_IDLE)) + goto get_task_out; + curr_task = per_core(current_task) = runqueues[core_id].idle; + } else { + // Does the current task have an higher priority? => no task switch + if ((curr_task->prio > prio) && (curr_task->status == TASK_RUNNING)) + goto get_task_out; + + if (curr_task->status == TASK_RUNNING) { + curr_task->status = TASK_READY; + runqueues[core_id].old_task = curr_task; + } + + curr_task = per_core(current_task) = runqueues[core_id].queue[prio-1].first; + curr_task->status = TASK_RUNNING; + + // remove new task from queue + runqueues[core_id].queue[prio-1].first = curr_task->next; + if (!curr_task->next) { + runqueues[core_id].queue[prio-1].last = NULL; + runqueues[core_id].prio_bitmap &= ~(1 << prio); + } + } get_task_out: -#if MAX_CORES > 1 - spinlock_irqsave_unlock(&table_lock); -#endif + spinlock_unlock(&runqueues[core_id].lock); if (curr_task != orig_task) { - //kprintf("schedule from %d to %d on core %d\n", orig_task->id, curr_task->id, smp_id()); - switch_task(new_id); -#if MAX_CORES > 1 - orig_task= per_core(old_task); - if (orig_task) - orig_task->flags &= ~TASK_SWITCH_IN_PROGRESS; -#endif + //kprintf("schedule from %u to %u with prio %u on core %u\n", + // orig_task->id, curr_task->id, (uint32_t)curr_task->prio, CORE_ID); + switch_task(curr_task->id); } } diff --git a/kernel/tests.c b/kernel/tests.c index 044e6f93..01a07a77 100644 --- a/kernel/tests.c +++ b/kernel/tests.c @@ -24,12 +24,15 @@ #include #include #include +#include +#include #ifdef CONFIG_ROCKCREEK #include #include #include #include #include +#include #include #include @@ -111,6 +114,139 @@ int mail_ping(void* arg) { return 0; } + +#define N 1024 +//#define N 514 +#define LAZY + +volatile static int* A[N]; +volatile static int* B[N]; +volatile static int* C[N]; + +static int svm_test(void *arg) +{ + uint64_t start, end; + uint32_t i, j, k; + int my_ue, num_ues; + + RCCE_barrier(&RCCE_COMM_WORLD); + my_ue = RCCE_ue(); + num_ues = RCCE_num_ues(); + +#if 1 + if (!my_ue) { + // allocate and initialize SVM region + A[0] = (int*) kmalloc(3*N*N*sizeof(int)); + memset((void*) A[0], 0x00, 3*N*N*sizeof(int)); + + // initialize matrices + for(i=0; i> PAGE_SHIFT); if (addr > addr + PAGE_SIZE) break; @@ -219,14 +219,14 @@ int mmu_init(void) * Now, we are able to read the FPGA registers and to * determine the number of slots for private memory. */ - uint32_t slots = *((volatile uint32_t*) (FPGA_BASE + 0x8244)); + uint32_t slots = *((volatile uint8_t*) (FPGA_BASE + 0x8244)); if (slots == 0) - slots = 21; + slots = 1; kprintf("MetalSVM use %d slots for private memory\n", slots); // define the residual private slots as free - for(addr=20*0x1000000; addr<(slots-1)*0x1000000; addr+=PAGE_SIZE) { + for(addr=1*0x1000000; addr> PAGE_SHIFT); if (addr > addr + PAGE_SIZE) break;
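/*
 * Example (an illustrative sketch, not taken from the patch): the fixed-point
 * exponential decay that update_load() in kernel/tasks.c applies every five
 * seconds, replayed as a host-side program. With FSHIFT = 21, 1.0 is
 * represented as 1 << 21. The decay constant is derived here from
 * exp(-5s/60s) because the patch's EXP definition did not survive extraction;
 * its exact kernel value is not reproduced.
 */
#include <stdint.h>
#include <stdio.h>
#include <math.h>

#define FSHIFT	21			/* bits of precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed-point */

int main(void)
{
	/* 1/exp(5sec/1min) in fixed-point: the weight of the old average */
	const uint32_t exp_5 = (uint32_t)(exp(-5.0/60.0) * FIXED_1);
	uint64_t load = 0;		/* fixed-point 1-minute load average */
	uint32_t nr_tasks = 3;		/* pretend 3 tasks stay runnable */
	int i;

	for (i = 0; i < 24; i++) {	/* 24 updates = 2 minutes */
		/* same update rule as update_load(): load = load*EXP + n*(1-EXP) */
		load = (load * exp_5 + (uint64_t)nr_tasks * (FIXED_1 - exp_5)) >> FSHIFT;
		printf("load after %3ds: %.3f\n", (i + 1) * 5, (double)load / FIXED_1);
	}
	/* the average converges toward nr_tasks = 3.000 */

	return 0;
}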