/*
 * Copyright 2010 Stefan Lankes, Chair for Operating Systems,
 *                RWTH Aachen University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This file is part of MetalSVM.
 */

/* NOTE: the original include list was lost; these headers are inferred from
 * the identifiers used below (memset/memcpy, task_t, tss_t, gdt_entry_t,
 * EINVAL, write_cr3, ...) and the MetalSVM tree layout is an assumption. */
#include <metalsvm/stdlib.h>
#include <metalsvm/string.h>
#include <metalsvm/stdio.h>
#include <metalsvm/tasks.h>
#include <metalsvm/errno.h>
#include <metalsvm/processor.h>
#include <asm/gdt.h>
#include <asm/tss.h>
#include <asm/page.h>

gdt_ptr_t gp;
static tss_t task_state_segments[MAX_CORES] __attribute__ ((aligned (PAGE_SIZE)));
// currently, our kernel has full access to the ioports
static gdt_entry_t gdt[GDT_ENTRIES] = {[0 ... GDT_ENTRIES-1] = {0, 0, 0, 0, 0, 0}};

/*
 * This is defined in entry.asm. We use this to properly reload
 * the new segment registers.
 */
extern void gdt_flush(void);

size_t* get_current_stack(void)
{
	task_t* curr_task = per_core(current_task);

	// determine and set esp0
#ifdef CONFIG_X86_32
	task_state_segments[CORE_ID].esp0 = (size_t) curr_task->stack + KERNEL_STACK_SIZE - 16; // => stack is 16-byte aligned
#else
	task_state_segments[CORE_ID].rsp0 = (size_t) curr_task->stack + KERNEL_STACK_SIZE - 16; // => stack is 16-byte aligned
#endif

	// use the new page table
	write_cr3(virt_to_phys((size_t) curr_task->page_map));

	return curr_task->last_stack_pointer;
}
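/*
 * Illustrative sketch (assumption, not MetalSVM code): the low-level switch
 * path in entry.asm is expected to consume get_current_stack() roughly as
 * the following C pseudo-equivalent shows. The name switch_stack_sketch is
 * hypothetical; kept in #if 0 so it is never compiled.
 */
#if 0
static void switch_stack_sketch(void)
{
	// get_current_stack() also reloads CR3 with the next task's page map,
	// so the pointer it returns is valid in the new address space
	size_t* new_sp = get_current_stack();

	// entry.asm would now load new_sp into ESP/RSP, pop the saved
	// register state (struct state), and finish with an IRET
	(void) new_sp;
}
#endif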
int arch_fork(task_t* task)
{
	struct state* state;
	task_t* curr_task = per_core(current_task);
	size_t state_size;

	if (BUILTIN_EXPECT(!task, 0))
		return -EINVAL;
	if (BUILTIN_EXPECT(!task->stack, 0))
		return -EINVAL;

#ifdef CONFIG_X86_32
	state_size = sizeof(struct state) - 2*sizeof(size_t);
#else
	state_size = sizeof(struct state);
#endif

	// copy the kernel stack of the current task
	mb();
	memcpy(task->stack, curr_task->stack, KERNEL_STACK_SIZE);

#ifdef CONFIG_X86_32
	size_t esp;

	asm volatile ("mov %%esp, %0" : "=m"(esp));
	// relocate the saved stack pointer into the child's stack copy
	esp -= (size_t) curr_task->stack;
	esp += (size_t) task->stack;

	state = (struct state*) (esp - state_size);
	//memset(state, 0x00, state_size);

	// capture the current general-purpose registers via PUSHA
	asm volatile ("pusha; pop %0" : "=m"(state->edi));
	asm volatile ("pop %0" : "=m"(state->esi));
	asm volatile ("pop %0" : "=m"(state->ebp));
	asm volatile ("add $4, %%esp" ::: "%esp"); // skip the ESP slot pushed by PUSHA
	asm volatile ("pop %0" : "=m"(state->ebx));
	asm volatile ("pop %0" : "=m"(state->edx));
	asm volatile ("pop %0" : "=m"(state->ecx));
	asm volatile ("pop %0" : "=m"(state->eax));

	state->esp = esp;
	task->last_stack_pointer = (size_t*) state;
	state->int_no = 0xB16B00B5;
	state->error = 0xC03DB4B3;
	state->cs = 0x08;
	state->ds = state->es = 0x10;
	// store the current EFLAGS
	asm volatile ("pushf; pop %0" : "=m"(state->eflags));
	// enable interrupts
	state->eflags |= (1 << 9);
	// This will be the entry point for the new task. read_ip cleans up the stack.
	asm volatile ("push %0; call read_ip" :: "r"(&state->eip) : "%eax");
#else
	size_t rsp;

	asm volatile ("mov %%rsp, %0" : "=m"(rsp));
	// relocate the saved stack pointer into the child's stack copy
	rsp -= (size_t) curr_task->stack;
	rsp += (size_t) task->stack;

	state = (struct state*) (rsp - state_size);
	//memset(state, 0x00, state_size);

	// capture the current general-purpose registers (there is no PUSHA in 64-bit mode)
	asm volatile ("push %rax");
	asm volatile ("push %rcx");
	asm volatile ("push %rdx");
	asm volatile ("push %rbx");
	asm volatile ("push %rbp");
	asm volatile ("push %rsi");
	asm volatile ("push %rdi");
	asm volatile ("push %r8");
	asm volatile ("push %r9");
	asm volatile ("push %r10");
	asm volatile ("push %r11");
	asm volatile ("pop %0" : "=m"(state->r11));
	asm volatile ("pop %0" : "=m"(state->r10));
	asm volatile ("pop %0" : "=m"(state->r9));
	asm volatile ("pop %0" : "=m"(state->r8));
	asm volatile ("pop %0" : "=m"(state->rdi));
	asm volatile ("pop %0" : "=m"(state->rsi));
	asm volatile ("pop %0" : "=m"(state->rbp));
	asm volatile ("pop %0" : "=m"(state->rbx));
	asm volatile ("pop %0" : "=m"(state->rdx));
	asm volatile ("pop %0" : "=m"(state->rcx));
	asm volatile ("pop %0" : "=m"(state->rax));

	state->rsp = rsp;
	task->last_stack_pointer = (size_t*) state;
	state->int_no = 0xB16B00B5;
	state->error = 0xC03DB4B3;
	state->cs = 0x08;
	state->ss = 0x10;
	asm volatile ("pushf; pop %0" : "=m"(state->rflags)); // store the current RFLAGS
	asm volatile ("leaq (%%rip), %0;" : "=r"(state->rip)); // store the current instruction pointer
	state->rflags |= (1 << 9); // enable interrupts
#endif

	return 0;
}
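/*
 * Illustrative sketch (assumption, not MetalSVM code): how a fork-style
 * path might use arch_fork(). create_task_struct() is a hypothetical
 * allocator; the real task creation lives in the generic task code.
 * Kept in #if 0 so it is never compiled.
 */
#if 0
static int fork_usage_sketch(void)
{
	task_t* child = create_task_struct(); // hypothetical allocator
	if (!child || !child->stack)
		return -EINVAL;

	// duplicates the parent's kernel stack into child->stack and crafts a
	// struct state there, so the child resumes just behind this call once
	// the scheduler picks it up on its own stack copy
	return arch_fork(child);
}
#endif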
int create_default_frame(task_t* task, entry_point_t ep, void* arg)
{
	size_t *stack;
	struct state *stptr;
	size_t state_size;

	if (BUILTIN_EXPECT(!task, 0))
		return -EINVAL;
	if (BUILTIN_EXPECT(!task->stack, 0))
		return -EINVAL;

	memset(task->stack, 0xCD, KERNEL_STACK_SIZE);

	/* The difference between setting up a task for SW task switching
	 * and for HW task switching is setting up a stack instead of a TSS.
	 * This is the stack which will be activated and popped off for iret later.
	 */
	stack = (size_t*) (task->stack + KERNEL_STACK_SIZE - 16); // => stack is 16-byte aligned

	/* The next items on the stack are a marker for debugging purposes, ... */
	*stack-- = 0xDEADBEEF;
#ifdef CONFIG_X86_32
	/* the first-function-to-be-called's argument, ... */
	*stack-- = (size_t) arg;
#endif
	/* and the "caller" we shall return to.
	 * This procedure cleans up the task after exit. */
	*stack = (size_t) leave_kernel_task;

	/* Next on the stack is the initial register state.
	 * The stack must look like the stack of a task which was
	 * scheduled away previously. */

	/* In 64-bit mode, the stack pointer (SS:RSP) is pushed unconditionally on interrupts.
	 * In legacy modes, this push is conditional and based on a change in the
	 * current privilege level (CPL). */
#ifdef CONFIG_X86_32
	state_size = sizeof(struct state) - 2*sizeof(size_t);
#else
	state_size = sizeof(struct state);
#endif

	stack = (size_t*) ((size_t) stack - state_size);

	stptr = (struct state *) stack;
	memset(stptr, 0x00, state_size);
#ifdef CONFIG_X86_32
	stptr->esp = (size_t)stack + state_size;
#else
	stptr->rsp = (size_t)stack + state_size;
	/* the first-function-to-be-called's argument is passed in RDI on x86_64 */
	stptr->rdi = (size_t) arg;
#endif
	stptr->int_no = 0xB16B00B5;
	stptr->error = 0xC03DB4B3;

	/* The instruction pointer shall point to the first function to be called
	 * after IRETing. */
#ifdef CONFIG_X86_32
	stptr->eip = (size_t)ep;
#else
	stptr->rip = (size_t)ep;
#endif
	stptr->cs = 0x08;
#ifdef CONFIG_X86_32
	stptr->eflags = 0x1202;
	// creating a kernel task doesn't change the IOPL
	// => useresp & ss are not required
	stptr->ds = stptr->es = 0x10;
#else
	stptr->rflags = 0x1202;
	stptr->ss = 0x10;
	stptr->userrsp = stptr->rsp;
#endif

	/* Set the task's stack pointer entry to the stack we have crafted right now. */
	task->last_stack_pointer = (size_t*)stack;

	return 0;
}

/* Setup a descriptor in the Global Descriptor Table */
static void gdt_set_gate(int num, unsigned long base, unsigned long limit,
			 unsigned char access, unsigned char gran)
{
	configure_gdt_entry(&gdt[num], base, limit, access, gran);
}

void configure_gdt_entry(gdt_entry_t *dest_entry, unsigned long base, unsigned long limit,
			 unsigned char access, unsigned char gran)
{
	/* Setup the descriptor base address */
	dest_entry->base_low = (base & 0xFFFF);
	dest_entry->base_middle = (base >> 16) & 0xFF;
	dest_entry->base_high = (base >> 24) & 0xFF;

	/* Setup the descriptor limits */
	dest_entry->limit_low = (limit & 0xFFFF);
	dest_entry->granularity = ((limit >> 16) & 0x0F);

	/* Finally, set up the granularity and access flags */
	dest_entry->granularity |= (gran & 0xF0);
	dest_entry->access = access;
}

/*
 * This will setup the special GDT pointer, set up the entries in our GDT,
 * and then finally call gdt_flush() in our assembler file in order to tell
 * the processor where the new GDT is and update the new segment registers.
 */
void gdt_install(void)
{
	unsigned int i;
	unsigned long mode, limit;

	memset(task_state_segments, 0x00, MAX_CORES*sizeof(tss_t));

#ifdef CONFIG_X86_32
	mode = GDT_FLAG_32_BIT;
	limit = 0xFFFFFFFF;
#elif defined(CONFIG_X86_64)
	mode = GDT_FLAG_64_BIT;
	limit = 0;
#else
#error invalid mode
#endif

	/* Setup the GDT pointer and limit */
	gp.limit = (sizeof(gdt_entry_t) * GDT_ENTRIES) - 1;
	gp.base = (size_t) &gdt;

	/* Our NULL descriptor */
	gdt_set_gate(0, 0, 0, 0, 0);

	/*
	 * The second entry is our Code Segment. The base address
	 * is 0, the limit is 4 GiB, it uses 4 KiB granularity,
	 * uses 32-bit opcodes, and is a Code Segment descriptor.
	 */
	gdt_set_gate(1, 0, limit,
		GDT_FLAG_RING0 | GDT_FLAG_SEGMENT | GDT_FLAG_CODESEG | GDT_FLAG_PRESENT,
		GDT_FLAG_4K_GRAN | mode);

	/*
	 * The third entry is our Data Segment. It's EXACTLY the
	 * same as our code segment, but the descriptor type in
	 * this entry's access byte says it's a Data Segment.
	 */
	gdt_set_gate(2, 0, limit,
		GDT_FLAG_RING0 | GDT_FLAG_SEGMENT | GDT_FLAG_DATASEG | GDT_FLAG_PRESENT,
		GDT_FLAG_4K_GRAN | mode);

	/*
	 * Create a code segment for userspace applications (ring 3)
	 */
	gdt_set_gate(3, 0, limit,
		GDT_FLAG_RING3 | GDT_FLAG_SEGMENT | GDT_FLAG_CODESEG | GDT_FLAG_PRESENT,
		GDT_FLAG_4K_GRAN | mode);

	/*
	 * Create a data segment for userspace applications (ring 3)
	 */
	gdt_set_gate(4, 0, limit,
		GDT_FLAG_RING3 | GDT_FLAG_SEGMENT | GDT_FLAG_DATASEG | GDT_FLAG_PRESENT,
		GDT_FLAG_4K_GRAN | mode);

	/*
	 * Create a TSS for each core at ring 0 (we use these segments for task
	 * switching). The body of this loop was truncated in the source; the
	 * following is a minimal reconstruction assuming one TSS descriptor per
	 * core right after the five flat-model entries.
	 */
	for(i=0; i<MAX_CORES; i++) {
#ifdef CONFIG_X86_32
		task_state_segments[i].ss0 = 0x10; // kernel data segment for the ring-0 stack
#endif
		gdt_set_gate(5+i, (unsigned long) (task_state_segments+i), sizeof(tss_t)-1,
			GDT_FLAG_PRESENT | GDT_FLAG_TSS | GDT_FLAG_RING0, 0);
	}

	/* Flush out the old GDT and install the new changes! */
	gdt_flush();
}
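/*
 * Illustrative sketch (assumption, not MetalSVM code): what
 * configure_gdt_entry() produces for the flat ring-0 code segment, using
 * the classic raw values (access byte 0x9A, granularity nibble 0xC0 for
 * 4 KiB pages + 32-bit opcodes) instead of the GDT_FLAG_* macros.
 * Kept in #if 0 so it is never compiled.
 */
#if 0
static void descriptor_encoding_sketch(void)
{
	gdt_entry_t e;

	configure_gdt_entry(&e, 0, 0xFFFFFFFF, 0x9A, 0xC0);
	// resulting fields:
	//   e.limit_low             == 0xFFFF (low 16 bits of the limit)
	//   e.granularity           == 0xCF   (limit bits 16-19 in the low nibble,
	//                                      gran/size flags in the high nibble)
	//   e.access                == 0x9A   (present | ring 0 | code | readable)
	//   e.base_low/middle/high  == 0      (flat base address)
}
#endif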