From 04a0dfb114cce9c17042e350cd56735fb679cc98 Mon Sep 17 00:00:00 2001 From: Stefan Lankes Date: Mon, 18 Apr 2011 15:03:35 +0200 Subject: [PATCH 1/5] enable the support of the system call "fork" remove obsolete file --- arch/x86/kernel/schedule.asm | 147 ----------------------------------- newlib/examples/tests.c | 2 +- 2 files changed, 1 insertion(+), 148 deletions(-) delete mode 100644 arch/x86/kernel/schedule.asm diff --git a/arch/x86/kernel/schedule.asm b/arch/x86/kernel/schedule.asm deleted file mode 100644 index 0a4b0286..00000000 --- a/arch/x86/kernel/schedule.asm +++ /dev/null @@ -1,147 +0,0 @@ -; -; Copyright 2010 Stefan Lankes, Chair for Operating Systems, -; RWTH Aachen University -; -; Licensed under the Apache License, Version 2.0 (the "License"); -; you may not use this file except in compliance with the License. -; You may obtain a copy of the License at -; -; http://www.apache.org/licenses/LICENSE-2.0 -; -; Unless required by applicable law or agreed to in writing, software -; distributed under the License is distributed on an "AS IS" BASIS, -; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -; See the License for the specific language governing permissions and -; limitations under the License. -; -; This file is part of metalsvm. - -[BITS 32] -SECTION .code -extern get_new_task -extern current_task -;extern task_table - -; C prototyp: -; -; typedef struct { -; unsigned char* top; -; unsigned int ip; -; tid_t id; -; ... -; } task_t; - -; After an interrupt, the original return address has to be pushed -; on the stack because we want to comeback... -global schedule_entry -schedule_entry: - cli - sub esp, 4 ; add some space for the return address, - ; which is stored in current_task->ip - - pushfd - pusha - push ds - push es - push fs - push gs - - mov ax, 0x10 - mov ds, ax - mov es, ax - mov fs, ax - mov gs, ax - - ;mov eax, DWORD [current_id] - ;shl eax, 0x4 ; 16 = sizeof(task_t) - ;mov edx, DWORD [eax+task_table+4] - ;mov [esp+13*4], edx - ;mov DWORD [eax+task_table+4], 0 - mov eax, DWORD [current_task] - mov edx, [eax+4] - mov DWORD [esp+13*4], edx - mov DWORD [eax+4], 0x00 - - jmp L1 - -; Scheduler, which switches to the new tasks -global schedule -schedule: - ret - - cli - ; return address is already on the stack (via call schedule) - pop eax - pushfd - push 0x08 - push eax - push byte 0 - push byte 32 - pusha - push ds - push es - push fs - push gs - - mov ax, 0x10 - mov ds, ax - mov es, ax - mov fs, ax - mov gs, ax - - mov eax, DWORD [current_task] - mov [eax], esp - - ;call get_new_task - - mov DWORD [current_task], eax - mov esp, [eax] - - pop gs - pop fs - pop es - pop ds - popa - add esp, 8 - xor eax, eax ; return value is 0 - iret - - cli - pushfd - pusha - push ds - push es - push fs - push gs - - mov ax, 0x10 - mov ds, ax - mov es, ax - mov fs, ax - mov gs, ax - - ; store current esp - ;mov eax, DWORD [current_id] - ;shl eax, 0x4 ; 16 = sizeof(task_t) -L1: - ;mov [eax+task_table], esp - mov eax, DWORD [current_task] - mov [eax], esp - - ;call get_new_task - - ; set current task and restore esp - ;mov DWORD [current_id], eax - ;shl eax, 0x4 ; 16 = sizeof(task_t) - ;mov esp, [eax+task_table] - mov DWORD [current_task], eax - mov esp, [eax] - - pop gs - pop fs - pop es - pop ds - popa - popfd - sti - ret diff --git a/newlib/examples/tests.c b/newlib/examples/tests.c index 7288ed82..e2bc27d3 100644 --- a/newlib/examples/tests.c +++ b/newlib/examples/tests.c @@ -34,7 +34,7 @@ int main(int argc, char** argv) printf("Create child process...\n"); - pid = 42; //fork(); + pid = fork(); if (pid == 0) { // child char* newargs[] = {"/bin/hello", "one", "two", "three", NULL}; char* newenv[] = {"USER=root", "PATH=/bin:/sbin:/usr/bin", "PWD=/", "TEMP=/tmp", NULL}; From 624afd187cc40487894fba7e291dbc5e01ad137f Mon Sep 17 00:00:00 2001 From: Stefan Lankes Date: Mon, 18 Apr 2011 15:05:27 +0200 Subject: [PATCH 2/5] by entering a system call, we need also to push the segment descriptor on the stack --- arch/x86/kernel/entry.asm | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/entry.asm b/arch/x86/kernel/entry.asm index f24b33dd..1a325dba 100644 --- a/arch/x86/kernel/entry.asm +++ b/arch/x86/kernel/entry.asm @@ -145,7 +145,6 @@ global isr29 global isr30 global isr31 global isrsyscall -global jump_to_child ; 0: Divide By Zero Exception isr0: @@ -433,6 +432,10 @@ extern syscall_handler ; used to realize system calls isrsyscall: + push ds + push fs + push gs + push es push ebp push edi push esi @@ -449,17 +452,10 @@ isrsyscall: pop esi pop edi pop ebp - iret - -jump_to_child: - add esp, 4 - mov eax, 0 ; child got always zero as return value - pop ebx - pop ecx - pop edx - pop esi - pop edi - pop ebp + pop es + pop gs + pop fs + pop ds iret global irq0 From 47f37e3b0033dc815719a4a46c9db5a61aceaaff Mon Sep 17 00:00:00 2001 From: Stefan Lankes Date: Mon, 18 Apr 2011 15:07:45 +0200 Subject: [PATCH 3/5] use memory barriers instead of read memory barriers to determine the current TSC => more accurate caclculation of the timer frequency + minor cosmetic changes --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/apic.c | 37 ++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a6ed0c13..ce88fcb6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -140,7 +140,7 @@ static inline void tlb_flush(void) static inline uint32_t read_eflags(void) { uint32_t result; - asm volatile ("pushf; popl %%eax" : "=a"(result) :: "memory"); + asm volatile ("pushf; pop %%eax" : "=a"(result)); return result; } diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 151b9143..d35ae989 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -56,6 +56,7 @@ static apic_mp_t* apic_mp = NULL; static apic_config_table_t* apic_config = NULL; static uint32_t lapic = 0; static volatile ioapic_t* ioapic = NULL; +static uint32_t icr = 0; static uint32_t ncores = 1; static uint8_t irq_redirect[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF}; #if MAX_CORES > 1 @@ -256,19 +257,28 @@ int smp_init(void) static int lapic_reset(void) { - uint32_t max_lvt = apic_lvt_entries(); + uint32_t max_lvt; + + if (!lapic) + return -ENXIO; + + max_lvt = apic_lvt_entries(); lapic_write(APIC_SVR, 0x17F); // enable the apic and connect to the idt entry 127 lapic_write(APIC_TPR, 0x00); // allow all interrupts - lapic_write(APIC_LVT_T, 0x10000); // disable timer interrupt + if (icr) { + lapic_write(APIC_DCR, 0xB); // set it to 1 clock increments + lapic_write(APIC_LVT_T, 0x2007B); // connects the timer to 123 and enables it + lapic_write(APIC_ICR, icr); + } else + lapic_write(APIC_LVT_T, 0x10000); // disable timer interrupt if (max_lvt >= 4) - lapic_write(APIC_LVT_TSR, 0x10000); // disable thermal sensor interrupt + lapic_write(APIC_LVT_TSR, 0x10000); // disable thermal sensor interrupt if (max_lvt >= 5) - lapic_write(APIC_LVT_PMC, 0x10000); // disable performance counter interrupt + lapic_write(APIC_LVT_PMC, 0x10000); // disable performance counter interrupt lapic_write(APIC_LINT0, 0x7C); // connect LINT0 to idt entry 124 lapic_write(APIC_LINT1, 0x7D); // connect LINT1 to idt entry 125 lapic_write(APIC_LVT_ER, 0x7E); // connect error to idt entry 126 - lapic_write(APIC_ID, 0x00); // reset boot processor id return 0; } @@ -328,13 +338,10 @@ int apic_calibration(void) HALT; diff = 0xFFFFFFFFUL - lapic_read(APIC_CCR); - diff = diff / 3; + icr = diff / 3; flags = irq_nested_disable(); lapic_reset(); - lapic_write(APIC_DCR, 0xB); // set it to 1 clock increments - lapic_write(APIC_LVT_T, 0x2007B); // connects the timer to 123 and enables it - lapic_write(APIC_ICR, diff); irq_nested_enable(flags); // Now, MetalSVM is able to use the APIC => Therefore, we disable the PIC @@ -353,26 +360,23 @@ int apic_calibration(void) lapic_write(APIC_ICR, 0xFFFFFFFFUL); /* wait 3 time slices to determine a ICR */ - rmb(); + mb(); start = rdtsc(); do { - rmb(); + mb(); end = rdtsc(); ticks = end > start ? end - start : start - end; } while(ticks*TIMER_FREQ < 3*RC_REFCLOCKMHZ*1000000UL); diff = 0xFFFFFFFFUL - lapic_read(APIC_CCR); - diff = diff / 3; + icr = diff / 3; lapic_reset(); - lapic_write(APIC_DCR, 0xB); // set it to 1 clock increments - lapic_write(APIC_LVT_T, 0x2007B); // connects the timer to 123 and enables it - lapic_write(APIC_ICR, diff); irq_nested_enable(flags); #endif - kprintf("APIC calibration determines an ICR of 0x%x\n", diff); + kprintf("APIC calibration determines an ICR of 0x%x\n", icr); flags = irq_nested_disable(); #if MAX_CORES > 1 @@ -568,6 +572,7 @@ int apic_init(void) // set APIC error handler irq_install_handler(126, apic_err_handler); + // initialize local apic ret = lapic_reset(); if (!ret) return ret; From 68281c8ad0020f2923dc2361234ecd79948cd118 Mon Sep 17 00:00:00 2001 From: Stefan Lankes Date: Mon, 18 Apr 2011 15:10:24 +0200 Subject: [PATCH 4/5] cosmetic changes, minor code optimization --- arch/x86/kernel/gdt.c | 6 +++--- arch/x86/kernel/irq.c | 8 +++++--- arch/x86/kernel/processor.c | 6 +++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/gdt.c b/arch/x86/kernel/gdt.c index b4fd61ae..30e165c7 100644 --- a/arch/x86/kernel/gdt.c +++ b/arch/x86/kernel/gdt.c @@ -68,9 +68,6 @@ int arch_fork(task_t* task) memcpy(task_state_segments+id, task_state_segments+curr_task->id, sizeof(tss_t)); memcpy(kstacks[id], kstacks[curr_task->id], KERNEL_STACK_SIZE); task_state_segments[id].cr3 = (uint32_t) (virt_to_phys((size_t)task->pgd)); - task_state_segments[id].eflags = read_eflags(); - // the parent task will enable the IF flag - task_state_segments[id].eflags |= (1 << 9); task_state_segments[id].esp0 = (uint32_t) kstacks[id] + KERNEL_STACK_SIZE - sizeof(size_t); asm volatile("mov %%esp, %0" : "=r"(task_state_segments[id].esp)); @@ -91,6 +88,9 @@ int arch_fork(task_t* task) asm volatile ("pop %0" : "=r"(task_state_segments[id].ecx)); asm volatile ("pop %0" : "=r"(task_state_segments[id].eax)); + // store current EFLAGS and set IF flag + // => the parent task will enable the interrupt handling + task_state_segments[id].eflags = read_eflags() | (1 << 9); // This will be the entry point for the new task. task_state_segments[id].eip = read_eip(); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 708061dd..3542199f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -226,9 +226,11 @@ void irq_handler(struct state *s) * Find out if we have a custom handler to run for this * IRQ and then finally, run it */ - handler = irq_routines[s->int_no]; - if (handler) - handler(s); + if (BUILTIN_EXPECT(s->int_no < MAX_HANDLERS, 1)) { + handler = irq_routines[s->int_no]; + if (handler) + handler(s); + } else kprintf("Invalid interrupt number %d\n", s->int_no); /* * If the IDT entry that was invoked was greater-than-or-equal to 48, diff --git a/arch/x86/kernel/processor.c b/arch/x86/kernel/processor.c index 2b1c84bf..aa4988f4 100644 --- a/arch/x86/kernel/processor.c +++ b/arch/x86/kernel/processor.c @@ -49,12 +49,12 @@ uint32_t detect_cpu_frequency(void) while((ticks = get_clock_tick()) - old == 0) HALT; - rmb(); + mb(); start = rdtsc(); /* wait a second to determine the frequency */ while(get_clock_tick() - ticks < TIMER_FREQ) HALT; - rmb(); + mb(); end = rdtsc(); diff = end > start ? end - start : start - end; @@ -78,7 +78,7 @@ void udelay(uint32_t usecs) uint64_t deadline = get_cpu_frequency() * usecs; do { - rmb(); + mb(); end = rdtsc(); diff = end > start ? end - start : start - end; } while(diff < deadline); From e94c31d03b0be3e3143febfa75c2ea718284b8f4 Mon Sep 17 00:00:00 2001 From: Stefan Lankes Date: Mon, 18 Apr 2011 15:12:18 +0200 Subject: [PATCH 5/5] add a new IRQ save spinlock implementation - avoids a deadlock - usable in a interrupt handler --- include/metalsvm/spinlock.h | 86 +++++++++++++++++++++---------- include/metalsvm/spinlock_types.h | 11 +++- include/metalsvm/stddef.h | 2 + kernel/tasks.c | 44 ++++++++++------ 4 files changed, 97 insertions(+), 46 deletions(-) diff --git a/include/metalsvm/spinlock.h b/include/metalsvm/spinlock.h index e6fcd15f..1d62c968 100644 --- a/include/metalsvm/spinlock.h +++ b/include/metalsvm/spinlock.h @@ -26,6 +26,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -44,8 +45,12 @@ inline static int spinlock_init(spinlock_t* s) { } inline static int spinlock_destroy(spinlock_t* s) { + if (BUILTIN_EXPECT(!s, 0)) + return -EINVAL; + s->owner = MAX_TASKS; s->counter = 0; + return 0; } @@ -61,8 +66,9 @@ inline static int spinlock_lock(spinlock_t* s) { } ticket = atomic_int32_inc(&s->queue); - while(atomic_int32_read(&s->dequeue) != ticket) - ; + while(atomic_int32_read(&s->dequeue) != ticket) { + NOP1; + } s->owner = per_core(current_task)->id; s->counter = 1; @@ -82,44 +88,70 @@ inline static int spinlock_unlock(spinlock_t* s) { return 0; } -inline static int spinlock_lock_irqsave(spinlock_t* s) { - uint32_t flags; - int ret; - +inline static int spinlock_irqsave_init(spinlock_irqsave_t* s) { if (BUILTIN_EXPECT(!s, 0)) return -EINVAL; - - flags = irq_nested_disable(); - ret = spinlock_lock(s); - if (ret) { - irq_nested_enable(flags); - return ret; - } + atomic_int32_set(&s->queue, 0); + atomic_int32_set(&s->dequeue, 1); + s->flags = 0; + s->coreid = (uint32_t)-1; + s->counter = 0; - if (!ret && (s->counter == 1)) - s->flags = flags; - - return ret; + return 0; } -inline static int spinlock_unlock_irqsave(spinlock_t* s) { - int ret, restore = 0; - uint32_t flags = 0; +inline static int spinlock_irqsave_destroy(spinlock_irqsave_t* s) { + if (BUILTIN_EXPECT(!s, 0)) + return -EINVAL; + + s->flags = 0; + s->coreid = (uint32_t)-1; + s->counter = 0; + + return 0; +} + +inline static int spinlock_irqsave_lock(spinlock_irqsave_t* s) { + uint32_t flags; + int32_t ticket; if (BUILTIN_EXPECT(!s, 0)) return -EINVAL; - if (s->counter == 1) { - restore = 1; - flags = s->flags; + flags = irq_nested_disable(); + if (s->coreid == CORE_ID) { + s->counter++; + return 0; } - ret = spinlock_unlock(s); - if (!ret && restore) - irq_nested_enable(flags); + ticket = atomic_int32_inc(&s->queue); + while (atomic_int32_read(&s->dequeue) != ticket) + NOP1; + + s->coreid = CORE_ID; + s->flags = flags; + s->counter = 1; + + return 0; +} - return ret; +inline static int spinlock_irqsave_unlock(spinlock_irqsave_t* s) { + uint32_t flags; + + if (BUILTIN_EXPECT(!s, 0)) + return -EINVAL; + + s->counter--; + if (!s->counter) { + flags = s->flags; + s->coreid = (uint32_t) -1; + s->flags = 0; + atomic_int32_inc(&s->dequeue); + irq_nested_enable(flags); + } + + return 0; } #ifdef __cplusplus diff --git a/include/metalsvm/spinlock_types.h b/include/metalsvm/spinlock_types.h index bac2f5df..5105f4d2 100644 --- a/include/metalsvm/spinlock_types.h +++ b/include/metalsvm/spinlock_types.h @@ -31,10 +31,17 @@ typedef struct spinlock { atomic_int32_t queue, dequeue; tid_t owner; uint32_t counter; - uint32_t flags; } spinlock_t; -#define SPINLOCK_INIT { ATOMIC_INIT(0), ATOMIC_INIT(1), MAX_TASKS, 0, 0 } +typedef struct spinlock_irqsave { + atomic_int32_t queue, dequeue; + uint32_t flags; + uint32_t coreid; + uint32_t counter; +} spinlock_irqsave_t; + +#define SPINLOCK_INIT { ATOMIC_INIT(0), ATOMIC_INIT(1), MAX_TASKS, 0} +#define SPINLOCK_IRQSAVE_INIT { ATOMIC_INIT(0), ATOMIC_INIT(1), 0, (uint32_t)-1, 0} #ifdef __cplusplus } diff --git a/include/metalsvm/stddef.h b/include/metalsvm/stddef.h index 2dfc9ba6..07d3ed27 100644 --- a/include/metalsvm/stddef.h +++ b/include/metalsvm/stddef.h @@ -35,6 +35,7 @@ typedef unsigned int tid_t; #define per_core(name) name #define DECLARE_PER_CORE(type, name) extern type name; #define DEFINE_PER_CORE(type, name, def_value) type name = def_value; +#define CORE_ID 0 #else #define per_core(name) name[LOGICAL_CPUID].var #define DECLARE_PER_CORE(type, name) \ @@ -42,6 +43,7 @@ typedef unsigned int tid_t; extern aligned_##name name[MAX_CORES]; #define DEFINE_PER_CORE(type, name, def_value) \ aligned_##name name[MAX_CORES] = {[0 ... MAX_CORES-1] = {def_value}}; +#define CORE_ID LOGICAL_CPUID #endif /* needed to find the task, which is currently running on this core */ diff --git a/kernel/tasks.c b/kernel/tasks.c index 3f7a1b7a..92e3ff2b 100644 --- a/kernel/tasks.c +++ b/kernel/tasks.c @@ -29,12 +29,13 @@ #include #include #include +#include #include DEFINE_PER_CORE(task_t*, current_task, NULL); static task_t task_table[MAX_TASKS] = {[0 ... MAX_TASKS-1] = {0, TASK_INVALID, ATOMIC_INIT(0), \ SPINLOCK_INIT, NULL, SPINLOCK_INIT, NULL}}; -static spinlock_t table_lock = SPINLOCK_INIT; +static spinlock_irqsave_t table_lock = SPINLOCK_IRQSAVE_INIT; /* * helper function for the assembly code to determine the current task @@ -63,7 +64,7 @@ static void wakeup_blocked_tasks(int result) wait_msg_t tmp = { per_core(current_task)->id, result }; unsigned int i; - spinlock_lock_irqsave(&table_lock); + spinlock_irqsave_lock(&table_lock); /* wake up blocked tasks */ for(i=0; ivma_lock); + spinlock_irqsave_lock(&table_lock); for(i=0; ivma_lock); child = &task_table[i].vma_list; parent = per_core(current_task)->vma_list; tmp = NULL; @@ -211,7 +212,6 @@ int sys_fork(void) tmp = *child; child = &((*child)->next); } - spinlock_unlock(&per_core(current_task)->vma_lock); mailbox_wait_msg_init(&task_table[i].inbox); memset(task_table[i].outbox, 0x00, sizeof(mailbox_wait_msg_t*)*MAX_TASKS); @@ -219,8 +219,13 @@ int sys_fork(void) ret = arch_fork(task_table+i); - if (parent_task != per_core(current_task)) - return 0; // Oh, the new child! => leave function + if (parent_task != per_core(current_task)) { + // Oh, the current task is the new child task! + // Leave the function without releasing the locks + // because the locks are already released + // by the parent task! + return 0; + } if (!ret) { task_table[i].status = TASK_READY; @@ -231,7 +236,8 @@ int sys_fork(void) } create_task_out: - spinlock_unlock_irqsave(&table_lock); + spinlock_irqsave_unlock(&table_lock); + spinlock_unlock(&per_core(current_task)->vma_lock); return ret; } @@ -576,7 +582,7 @@ int wakeup_task(tid_t id) int ret = -EINVAL; /* avoid nested locking */ - spinlock_lock_irqsave(&table_lock); + spinlock_irqsave_lock(&table_lock); if (task_table[id].status != TASK_BLOCKED) { kprintf("Task %d is not blocked!\n", id); @@ -585,7 +591,7 @@ int wakeup_task(tid_t id) ret = 0; } - spinlock_unlock_irqsave(&table_lock); + spinlock_irqsave_unlock(&table_lock); return ret; } @@ -594,14 +600,14 @@ int block_task(tid_t id) { int ret = -EINVAL; - spinlock_lock_irqsave(&table_lock); + spinlock_irqsave_lock(&table_lock); if ((task_table[id].status == TASK_RUNNING) || (task_table[id].status == TASK_READY)) { task_table[id].status = TASK_BLOCKED; ret = 0; } else kprintf("Unable to block task %d!\n", id); - spinlock_unlock_irqsave(&table_lock); + spinlock_irqsave_unlock(&table_lock); return ret; } @@ -611,7 +617,9 @@ void scheduler(void) unsigned int i; unsigned int new_id; - spinlock_lock(&table_lock); +#if MAX_CORES > 1 + spinlock_irqsave_lock(&table_lock); +#endif /* signalize that this task could be reused */ if (per_core(current_task)->status == TASK_FINISHED) @@ -645,5 +653,7 @@ void scheduler(void) } get_task_out: - spinlock_unlock(&table_lock); +#if MAX_CORES > 1 + spinlock_irqsave_unlock(&table_lock); +#endif }