/*
 * Copyright 2012 Stefan Lankes, Chair for Operating Systems,
 * RWTH Aachen University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This file is part of MetalSVM.
 */

#include <metalsvm/stddef.h>
#include <metalsvm/stdio.h>
#include <metalsvm/stdlib.h>
#include <metalsvm/string.h>
#include <metalsvm/mmu.h>
#include <metalsvm/vma.h>
#include <metalsvm/page.h>
#include <metalsvm/spinlock.h>
#include <metalsvm/processor.h>
#include <metalsvm/tasks.h>
#include <metalsvm/errno.h>
#include <asm/irq.h>
#include <asm/multiboot.h>
#include <asm/apic.h>

/*
 * Virtual Memory Layout of the standard configuration
 * (1 GB kernel space)
 *
 * 0x000000000000 - 0x0000000FFFFF: reserved for IO devices (1MB)
 * 0x000000100000 - 0x00000DEADFFF: Kernel (size depends on the configuration) (221MB)
 * 0x00000DEAE000 - 0x00003FFFFFFF: Kernel heap
 * 0xFF4000000000 - 0xFF7FFFFFFFFF: Paging structures for copying a page map (max 512GB)
 * 0xFF8000000000 - 0xFFFFFFFFFFFF: Paging structures are mapped in this region (max 512GB)
 */

/// Boot task's page map
extern page_map_t boot_pml4;
/// Kernel space page map lock
static spinlock_t kslock = SPINLOCK_INIT;

page_map_t* get_boot_page_map(void)
{
	return &boot_pml4;
}

// TODO
size_t virt_to_phys(size_t viraddr)
{
	task_t* task = per_core(current_task);

	spinlock_irqsave_lock(&task->page_lock);

	size_t* entry = (size_t*) (PAGE_MAP_PGT | (viraddr >> 9));
	size_t phyaddr = (*entry & ~PAGE_FLAGS_MASK) | (viraddr & ~PAGE_MASK);

	spinlock_irqsave_unlock(&task->page_lock);

	return phyaddr;
}

/** @brief Update page table bits (PG_*) by using arch independent flags (MAP_*) */
static inline size_t page_bits(int flags)
{
	size_t bits = PG_PRESENT|PG_RW|PG_GLOBAL|PG_XD;

	if (flags & MAP_NO_ACCESS)		bits &= ~PG_PRESENT;
	if (flags & MAP_READ_ONLY)		bits &= ~PG_RW;
	if (flags & MAP_CODE)			bits &= ~PG_XD;
	if (flags & MAP_USER_SPACE)		bits &= ~PG_GLOBAL;
	if (flags & MAP_USER_SPACE)		bits |= PG_USER;
	if (flags & MAP_WT)			bits |= PG_PWT;
	if (flags & MAP_NO_CACHE)		bits |= PG_PCD;
	if (flags & MAP_MPE)			bits |= PG_MPE;
	if (flags & MAP_SVM_INIT)		bits |= PG_SVM_INIT;
	if (flags & MAP_SVM_LAZYRELEASE)	bits |= PG_SVM_LAZYRELEASE;
	if (flags & MAP_SVM_STRONG)		bits |= PG_SVM_STRONG;

	return bits;
}

/** @brief Recursive traversal through the page map tree
 *
 * @param start The first address whose page map entry we will call on
 * @param end The exclusive end address whose page map entry we will call on
 * @param pre Callback which is called for every page map entry (pre-order traversal)
 * @param post Callback which is called for every page map entry (post-order traversal)
 */
int page_iterate(size_t start, size_t end, page_cb_t pre, page_cb_t post)
{
	page_entry_t* entry[PAGE_MAP_LEVELS];
	page_entry_t* last[PAGE_MAP_LEVELS];

	if (BUILTIN_EXPECT(start >= end, 0))
		return -EINVAL;

	// setup subtree boundaries
	int i;
	for (i=0; i<PAGE_MAP_LEVELS; i++) {
		entry[i] = virt_to_entry(start, i);
		last[i] = virt_to_entry(end - 1, i);
	}

	// nested iterator function (sees the scope of the parent)
	int iterate(int level) {
		int ret;

		while (entry[level] <= last[level]) {
			if (pre) { // call pre-order callback if available
				ret = pre(entry[level], level);
				if (BUILTIN_EXPECT(ret < 0, 0))
					return ret;
			}

			// recurse if we are not at the PGT level,
			// the inferior table is present and no huge page (PSE)
			if (level && (*entry[level] & PG_PRESENT) && !(*entry[level] & PG_PSE)) {
				// resynchronize the child cursor to the subtree of this entry
				size_t child = entry_to_virt(entry[level], level);
				entry[level-1] = virt_to_entry((child > start) ? child : start, level-1);

				ret = iterate(level-1);
				if (BUILTIN_EXPECT(ret < 0, 0))
					return ret;
			}

			if (post) { // call post-order callback if available
				ret = post(entry[level], level);
				if (BUILTIN_EXPECT(ret < 0, 0))
					return ret;
			}

			// return to the superior table if we've reached the end of this one
			entry[level]++;
			if (((size_t) entry[level] & ~PAGE_MASK) == 0x000)
				return 0;
		}

		return 0;
	}

	// we start at the highest order table (PML4)
	return iterate(PAGE_MAP_LEVELS-1);
}

void page_dump(size_t from, size_t to)
{
	task_t* task = per_core(current_task);

	size_t flags = 0;
	size_t start = 0;

	void print(size_t start, size_t end, size_t flags) {
		size_t size = end - start;

		kprintf("%#018lx-%#018lx %#14lx %c%c%c%c%c%c\n", start, end, size,
			(flags & PG_XD) ? '-' : 'x',
			(flags & PG_GLOBAL) ? 'g' : '-',
			(flags & PG_DIRTY) ? 'd' : '-',
			(flags & PG_ACCESSED) ? 'a' : '-',
			(flags & PG_USER) ? 'u' : '-',
			(flags & PG_RW) ? 'w' : '-');
	}

	int cb(page_entry_t* entry, int level) {
		size_t end;

		if (*entry & PG_PRESENT) {
			if (!level || (*entry & PG_PSE)) {
				if (!flags) {
					flags = *entry & PAGE_FLAGS_MASK;
					start = entry_to_virt(entry, level);
				}
				else if (flags != (*entry & PAGE_FLAGS_MASK)) {
					end = entry_to_virt(entry, level);
					print(start, end, flags);

					start = end;
					flags = *entry & PAGE_FLAGS_MASK;
				}
			}
		}
		else if (flags) {
			end = entry_to_virt(entry, level);
			print(start, end, flags);
			flags = 0;
		}

		return 0;
	}

	// lock tables
	spinlock_lock(&kslock);
	spinlock_irqsave_lock(&task->page_lock);

	kprintf("%-18s-%18s %14s %-6s\n", "start", "end", "size", "flags"); // header

	page_iterate(from, to, cb, NULL);

	// unlock tables
	spinlock_unlock(&kslock);
	spinlock_irqsave_unlock(&task->page_lock);

	// workaround to print last mapping
	if (flags)
		print(start, PAGE_FLOOR(to), flags);
}
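/*
 * Usage sketch (illustration only, not part of MetalSVM): counting the
 * present 4 KiB mappings of a range with page_iterate(). The nested
 * callback 'count_present' is hypothetical and mirrors the cb() closures
 * used by page_dump() and page_stats(); it only inspects PGT entries
 * (level 0).
 *
 *   static size_t count_mapped(size_t from, size_t to)
 *   {
 *       size_t count = 0;
 *
 *       int count_present(page_entry_t* entry, int level) {
 *           if (!level && (*entry & PG_PRESENT))
 *               count++;
 *           return 0;
 *       }
 *
 *       page_iterate(from, to, count_present, NULL);
 *
 *       return count;
 *   }
 */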
void page_stats(size_t from, size_t to, int reset)
{
	task_t* task = per_core(current_task);

	int i, stats[13] = { 0 };
	const char* labels[] = {
		[0] = "present", "writable", "user accessible", "write through", "cache disabled", // IA-32 "legacy" bits
		"accessed", "dirty", "huge pages", "global", "svm", "svm lazy", "svm init",
		[12] = "exec disabled" // IA-32e / PAE bits
	};

	int cb(page_entry_t* entry, int level) {
		if (*entry & PG_PRESENT) {
			if (!level || (*entry & PG_PSE)) {
				// increment stat counters
				int i;
				for (i=0; i<12; i++) { // IA-32 "legacy" bits
					if (*entry & (1 << i))
						stats[i]++;
				}
				for (i=0; i<1; i++) { // IA-32e / PAE bits
					if (*entry & (1L << (63-i)))
						stats[i+12]++;
				}
			}

			// reset accessed and dirty bits
			if (reset) {
				*entry &= ~(PG_ACCESSED|PG_DIRTY);
				tlb_flush_one_page(entry_to_virt(entry, level)); // see IA32 Vol3 4.8
			}
		}

		return 0;
	}

	// lock tables
	spinlock_lock(&kslock);
	spinlock_irqsave_lock(&task->page_lock);

	page_iterate(from, to, cb, NULL);

	// unlock tables
	spinlock_unlock(&kslock);
	spinlock_irqsave_unlock(&task->page_lock);

	kprintf("total pages:\n");
	for (i=0; i<13; i++)
		kprintf(" - %s:%*lu\n", labels[i], 25-strlen(labels[i]), stats[i]);
}

int copy_page_map(task_t* new_task, int copy)
{
	task_t* cur_task = per_core(current_task);

	size_t phyaddr;
	uint32_t ret;

	int cb(page_entry_t* src, int level) {
		page_entry_t* dest = src - (1L<<36); // TODO

		if (*src & PG_PRESENT) {
			if (*src & PG_USER) {
				kprintf("cb: src=%p, dest=%p, *src=%#lx, level=%u ", src, dest, *src, level); // TODO: remove
				if (level) { // deep copy user table
					kputs("deep copy\n");

					size_t phyaddr = get_page();
					if (BUILTIN_EXPECT(!phyaddr, 0))
						return -ENOMEM;

					atomic_int32_inc(&cur_task->user_usage);

					*dest = phyaddr | (*src & PAGE_FLAGS_MASK);
					// TODO: copy_page?
					// TODO: memset(*dest, 0)?
				}
				else if (copy) { // deep copy page frame
					kputs("deep copy frame\n");

					size_t phyaddr = get_page();
					if (BUILTIN_EXPECT(!phyaddr, 0))
						return -ENOMEM;

					atomic_int32_inc(&cur_task->user_usage);

					copy_page(phyaddr, *src & ~PAGE_FLAGS_MASK);
					*dest = phyaddr | (*src & PAGE_FLAGS_MASK);
				}
				else
					kputs("???\n");
			}
			else // shallow copy kernel table
				*dest = *src;
		}

		tlb_flush(); // ouch :(

		return 0;
	}

	// fixed mapping for paging structures
	page_map_t *current = (page_map_t*) PAGE_MAP_PML4;
	page_map_t *new = palloc(PAGE_SIZE, 0);
	if (BUILTIN_EXPECT(!new, 0))
		return -ENOMEM;

	phyaddr = virt_to_phys((size_t) new);

	// lock tables
	spinlock_lock(&kslock);
	spinlock_irqsave_lock(&cur_task->page_lock);

	// map new table
	current->entries[PAGE_MAP_ENTRIES-2] = phyaddr | PG_TABLE;
	tlb_flush(); // ouch :(

	// setup self reference for new table
	new->entries[PAGE_MAP_ENTRIES-1] = phyaddr | PG_TABLE;

	ret = page_iterate(0, PAGE_MAP_PGT - (1L<<39), cb, NULL); // TODO: check boundaries

	// unlock tables
	spinlock_irqsave_unlock(&cur_task->page_lock);
	spinlock_unlock(&kslock);

	// unmap new table
	current->entries[PAGE_MAP_ENTRIES-2] = 0;
	tlb_flush(); // ouch :(

	new_task->page_map = new;

	kprintf("copy_page_map: allocated %i page tables\n", ret); // TODO: remove

	return ret;
}

int drop_page_map(void)
{
	task_t* task = per_core(current_task);

	int cb(page_entry_t* entry, int level) {
		if (*entry & PG_USER) {
			kprintf("drop_page_map:cb: entry = %p, level = %u\n", entry, level); // TODO: remove

			if (put_page(*entry & ~PAGE_FLAGS_MASK))
				atomic_int32_dec(&task->user_usage);
		}

		return 0;
	}

	kprintf("drop_page_map: task = %u\n", task->id); // TODO: remove

	// check assertions
	if (BUILTIN_EXPECT(task->page_map == get_boot_page_map(), 0))
		return -EINVAL;
	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return -EINVAL;

	// lock tables
	spinlock_irqsave_lock(&task->page_lock);

	int ret = page_iterate(0, PAGE_MAP_PGT, NULL, cb); // TODO: check boundaries

	pfree(task->page_map, PAGE_SIZE);

	// unlock tables
	spinlock_irqsave_unlock(&task->page_lock);

	kprintf("drop_page_map: finished\n"); // TODO: remove

	return 0;
}
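/*
 * Usage sketch (hypothetical caller, not part of this file): a fork-like
 * creation path duplicates the current address space for a new task, and
 * the task releases its user frames again via drop_page_map() when it
 * exits. 'create_child' is made up for illustration; error handling is
 * elided.
 *
 *   int create_child(task_t* child)
 *   {
 *       // copy != 0: deep-copy user page frames, share kernel mappings
 *       int ret = copy_page_map(child, 1);
 *       if (BUILTIN_EXPECT(ret < 0, 0))
 *           return ret;
 *
 *       return 0;
 *   }
 *
 *   // ...and in the exit path of that task:
 *   drop_page_map();
 */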
static int set_page_flags(size_t viraddr, uint32_t npages, int flags)
{
	task_t* task = per_core(current_task);

	size_t bits = page_bits(flags);
	size_t start = viraddr;
	size_t end = start + npages * PAGE_SIZE;

	int cb(page_entry_t* entry, int level) {
		if (level) {
			if (flags & MAP_USER_SPACE)
				*entry |= PG_USER;
		}
		else
			*entry = (*entry & ~PAGE_FLAGS_MASK) | bits;

		tlb_flush_one_page(entry_to_virt(entry, level));

		return 0;
	}

	// check assertions
	if (BUILTIN_EXPECT(start < KERNEL_SPACE && end >= KERNEL_SPACE, 0))
		return 0;
	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return 0;

	// lock tables
	if (viraddr < KERNEL_SPACE)
		spinlock_lock(&kslock);
	else
		spinlock_irqsave_lock(&task->page_lock);

	int ret = page_iterate(start, end, cb, NULL);

	// unlock tables
	if (viraddr < KERNEL_SPACE)
		spinlock_unlock(&kslock);
	else
		spinlock_irqsave_unlock(&task->page_lock);

	return ret;
}

size_t map_region(size_t viraddr, size_t phyaddr, uint32_t npages, uint32_t flags)
{
	task_t* task = per_core(current_task);

	if (!viraddr) {
		int vma_flags = VMA_HEAP;
		if (flags & MAP_USER_SPACE)
			vma_flags |= VMA_USER;

		viraddr = vma_alloc(npages * PAGE_SIZE, vma_flags);
	}

	size_t bits = page_bits(flags);
	size_t start = viraddr;
	size_t end = start + npages * PAGE_SIZE;

	int cb(page_entry_t* entry, int level) {
		if (level) { // PGD, PDPT, PML4...
			if (*entry & PG_PRESENT) {
				if (flags & MAP_USER_SPACE) {
					/*
					 * We are changing page map entries which cover
					 * the kernel. So before altering them we need to
					 * make a private copy for the task.
					 */
					if (!(*entry & PG_USER)) {
						size_t phyaddr = get_page();
						if (BUILTIN_EXPECT(!phyaddr, 0))
							return -ENOMEM;

						atomic_int32_inc(&task->user_usage);

						copy_page(phyaddr, *entry & ~PAGE_FLAGS_MASK);
						*entry = phyaddr | (*entry & PAGE_FLAGS_MASK) | PG_USER;

						/*
						 * We just need to flush the table itself.
						 * TLB entries for the kernel remain valid
						 * because we've not changed them.
						 */
						tlb_flush_one_page(entry_to_virt(entry, 0));
					}
				}
			}
			else {
				size_t phyaddr = get_page();
				if (BUILTIN_EXPECT(!phyaddr, 0))
					return -ENOMEM;

				atomic_int32_inc(&task->user_usage);

				*entry = phyaddr | bits;
			}
		}
		else { // PGT
			if ((*entry & PG_PRESENT) && !(flags & MAP_REMAP))
				return -EINVAL;

			*entry = phyaddr | bits;

			if (flags & MAP_USER_SPACE)
				atomic_int32_inc(&task->user_usage);

			if (flags & MAP_REMAP)
				tlb_flush_one_page(entry_to_virt(entry, level));

			phyaddr += PAGE_SIZE;
		}

		return 0;
	}

	kprintf("map_region: map %u pages from %#lx to %#lx with flags: %#x\n", npages, viraddr, phyaddr, flags); // TODO: remove

	// check assertions
	if (BUILTIN_EXPECT(start < KERNEL_SPACE && end >= KERNEL_SPACE, 0))
		return 0;
	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return 0;
	if (BUILTIN_EXPECT(!viraddr, 0))
		return 0;

	// lock tables
	if (viraddr < KERNEL_SPACE)
		spinlock_lock(&kslock);
	else
		spinlock_irqsave_lock(&task->page_lock);

	int ret = page_iterate(start, end, cb, NULL);

	// unlock tables
	if (viraddr < KERNEL_SPACE)
		spinlock_unlock(&kslock);
	else
		spinlock_irqsave_unlock(&task->page_lock);

	return (ret == 0) ? viraddr : 0;
}
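/*
 * Usage sketch (illustration only; the physical address is an example
 * value, not taken from this kernel): mapping one uncached page of
 * device MMIO registers into a freshly allocated virtual page and
 * releasing it again with unmap_region() below. Passing viraddr = 0
 * lets map_region() pick a virtual address via vma_alloc().
 *
 *   size_t mmio = map_region(0, 0xFEE00000, 1, MAP_NO_CACHE);
 *   if (mmio) {
 *       // ... access the device registers through 'mmio' ...
 *       unmap_region(mmio, 1);
 *   }
 */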
int unmap_region(size_t viraddr, uint32_t npages)
{
	task_t* task = per_core(current_task);

	size_t start = viraddr;
	size_t end = start + npages * PAGE_SIZE;

	kprintf("unmap_region: unmap %u pages from %#lx\n", npages, viraddr); // TODO: remove

	int cb(page_entry_t* entry, int level) {
		if (level) { // PGD, PDPT, PML4
			page_map_t* map = (page_map_t*) entry_to_virt(entry, 0);
			int used = 0;

			int i;
			for (i=0; i<PAGE_MAP_ENTRIES; i++) {
				if (map->entries[i] & PG_PRESENT)
					used++;
			}

			if (!used) {
				*entry &= ~PG_PRESENT;
				tlb_flush_one_page(entry_to_virt(entry, 0));

				if (put_page(*entry & ~PAGE_FLAGS_MASK))
					atomic_int32_dec(&task->user_usage);
			}
		}
		else { // PGT
			*entry = 0;

			tlb_flush_one_page(entry_to_virt(entry, level));

			if (viraddr >= KERNEL_SPACE)
				atomic_int32_dec(&task->user_usage);
		}

		return 0;
	}

	// check assertions
	if (BUILTIN_EXPECT(start < KERNEL_SPACE && end >= KERNEL_SPACE, 0))
		return 0;
	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return 0;

	// lock tables
	if (viraddr < KERNEL_SPACE)
		spinlock_lock(&kslock);
	else
		spinlock_irqsave_lock(&task->page_lock);

	int ret = page_iterate(start, end, NULL, cb);

	// unlock tables
	if (viraddr < KERNEL_SPACE)
		spinlock_unlock(&kslock);
	else
		spinlock_irqsave_unlock(&task->page_lock);

	return ret;
}

static void pagefault_handler(struct state *s)
{
	task_t* task = per_core(current_task);
	size_t viraddr = read_cr2();

	// on demand userspace heap mapping
	if ((task->heap) && (viraddr >= task->heap->start) && (viraddr < task->heap->end)) {
		viraddr &= PAGE_MASK;

		size_t phyaddr = get_page();
		if (BUILTIN_EXPECT(!phyaddr, 0)) {
			kprintf("out of memory: task = %u\n", task->id);
			goto default_handler;
		}

		viraddr = map_region(viraddr, phyaddr, 1, MAP_USER_SPACE);
		if (BUILTIN_EXPECT(!viraddr, 0)) {
			kprintf("map_region: could not map %#lx to %#lx, task = %u\n", viraddr, phyaddr, task->id);
			put_page(phyaddr);

			goto default_handler;
		}

		memset((void*) viraddr, 0x00, PAGE_SIZE); // fill with zeros

		return;
	}

default_handler:
	kprintf("Page Fault Exception (%d) at cs:rip = %#x:%#lx, core = %u, task = %u, addr = %#lx, error = %#x [ %s %s %s %s %s ]\n"
		"Register state: rflags = %#lx, rax = %#lx, rbx = %#lx, rcx = %#lx, rdx = %#lx, rdi = %#lx, rsi = %#lx, rbp = %#lx, rsp = %#lx\n",
		s->int_no, s->cs, s->rip, CORE_ID, task->id, viraddr, s->error,
		(s->error & 0x4) ? "user" : "supervisor",
		(s->error & 0x10) ? "instruction" : "data",
		(s->error & 0x2) ? "write" : ((s->error & 0x10) ? "fetch" : "read"),
		(s->error & 0x1) ? "protection" : "not present",
		(s->error & 0x8) ? "reserved bit" : "\b",
		s->rflags, s->rax, s->rbx, s->rcx, s->rdx, s->rdi, s->rsi, s->rbp, s->rsp);

	irq_enable();
	abort();
}

int arch_paging_init(void)
{
	uint32_t i, npages;

	// replace default pagefault handler
	irq_uninstall_handler(14);
	irq_install_handler(14, pagefault_handler);

	// enable PAE and global pages for kernel space (see IA32 Vol3 4.10.2.4)
	write_cr4(read_cr4() | CR4_PGE | CR4_PAE);

	// enable execution disable bit (see IA32 Vol3 4.6)
	wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);

	// setup recursive paging
	boot_pml4.entries[PAGE_MAP_ENTRIES-1] = (size_t) &boot_pml4 | PG_TABLE;

	/*
	 * In long mode the kernel is already mapped into the kernel space (see entry64.asm).
	 * This includes .data, .bss, .text, VGA, the multiboot & multiprocessing (APIC) structures.
	 */

#if MAX_CORES > 1
	// reserve page for SMP boot code
	if (!map_region(SMP_SETUP_ADDR, SMP_SETUP_ADDR, 1, MAP_NO_CACHE | MAP_REMAP)) {
		kputs("could not reserve page for smp boot code\n");
		return -ENOMEM;
	}
#endif

#ifdef CONFIG_MULTIBOOT
#if 0
	// map reserved memory regions into the kernel space
	if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MEM_MAP)) {
		multiboot_memory_map_t* mmap = (multiboot_memory_map_t*) mb_info->mmap_addr;
		multiboot_memory_map_t* mmap_end = (void*) ((size_t) mb_info->mmap_addr + mb_info->mmap_length);

		while (mmap < mmap_end) {
			if (mmap->type != MULTIBOOT_MEMORY_AVAILABLE) {
				npages = mmap->len / PAGE_SIZE;
				if ((mmap->addr+mmap->len) % PAGE_SIZE)
					npages++;

				map_region(mmap->addr, mmap->addr, npages, MAP_NO_CACHE | MAP_REMAP);
			}
			mmap++;
		}
	}
#endif

	/*
	 * Modules like the init ram disk are already loaded.
	 * Therefore, we map these modules into the kernel space.
	 */
	if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MODS)) {
		multiboot_module_t* mmodule = (multiboot_module_t*) ((size_t) mb_info->mods_addr);

		npages = PAGE_FLOOR(mb_info->mods_count*sizeof(multiboot_module_t)) >> PAGE_BITS;
		map_region((size_t) mmodule, (size_t) mmodule, npages, MAP_REMAP);

		for(i=0; i<mb_info->mods_count; i++, mmodule++) {
			// map physical address to the same virtual address
			npages = PAGE_FLOOR(mmodule->mod_end - mmodule->mod_start) >> PAGE_BITS;
			kprintf("Map module %s at %#x (%u pages)\n", (char*)(size_t) mmodule->cmdline, mmodule->mod_start, npages);
			map_region((size_t) (mmodule->mod_start), (size_t) (mmodule->mod_start), npages, MAP_REMAP);
		}
	}
#endif

	// we turned on paging => now, we are able to register our task
	register_task();

	// map APIC registers into the kernel address space
	map_apic();

	return 0;
}