/*
 * Copyright 2012 Stefan Lankes, Chair for Operating Systems,
 *                RWTH Aachen University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This file is part of MetalSVM.
 */

#include <metalsvm/stddef.h>
#include <metalsvm/stdio.h>
#include <metalsvm/stdlib.h>
#include <metalsvm/mmu.h>
#include <metalsvm/vma.h>
#include <metalsvm/string.h>
#include <metalsvm/page.h>
#include <metalsvm/spinlock.h>
#include <metalsvm/processor.h>
#include <metalsvm/tasks.h>
#include <metalsvm/errno.h>
#include <asm/irq.h>
#include <asm/multiboot.h>
#include <asm/apic.h>

/*
 * Virtual Memory Layout of the standard configuration
 * (1 GB kernel space)
 *
 * 0x000000000000 - 0x0000000FFFFF:  reserved for IO devices (16MB)
 * 0x000000100000 - 0x00000DEADFFF:  Kernel (size depends on the configuration) (221MB)
 * 0x00000DEAE000 - 0x00003FFFFFFF:  Kernel heap
 * 0xFF8000000000 - 0xFFFFFFFFFFFF:  Paging structures are mapped in this region (max 512GB)
 */

/*
 * Note that linker symbols are not variables, they have no memory allocated
 * for maintaining a value, rather their address is their value.
 */
extern const void kernel_start;
extern const void kernel_end;

// boot task's page map and page map lock
extern page_map_t boot_pml4;
static spinlock_t kslock = SPINLOCK_INIT;

page_map_t* get_boot_page_map(void)
{
	return &boot_pml4;
}

/** @brief Copy a single page frame
 *
 * @param src virtual address of source page frame
 * @return physical addr to copied page frame
 */
static size_t copy_page_frame(size_t *src)
{
	kprintf("copy_page_frame(%p)\n", src);

#if 1 // TODO: untested
	size_t phyaddr, viraddr;

	// allocate and map an empty page
	phyaddr = get_page();
	if (BUILTIN_EXPECT(!phyaddr, 0))
		return 0;

	viraddr = vma_alloc(PAGE_SIZE, VMA_HEAP);
	if (BUILTIN_EXPECT(!viraddr, 0))
		return 0;

	viraddr = map_region(viraddr, phyaddr, 1, MAP_KERNEL_SPACE);
	if (BUILTIN_EXPECT(!viraddr, 0))
		return 0;

	// copy the whole page
	memcpy((void*) viraddr, (void*) src, PAGE_SIZE);

	// unmap and free page
	unmap_region(viraddr, 1);
	vma_free(viraddr, viraddr+PAGE_SIZE);

	return phyaddr;
#else
	kprintf("TODO: copy_page_frame(%p)\n", src);
	return 0;
#endif
}

static inline size_t canonicalize(size_t addr)
{
	if (addr & (1UL << 47))
		return addr | 0xFFFF000000000000UL;	// sign-extend bits 48..63
	else
		return addr & ((1UL << 48) - 1);
}

static inline int map_to_level(size_t addr)
{
	if (addr >= PAGE_PML4)
		return 4;
	else if (addr >= PAGE_PDPT)
		return 3;
	else if (addr >= PAGE_PGD)
		return 2;
	else if (addr >= PAGE_PGT)
		return 1;
	else
		return -EINVAL;
}

static inline const char* map_to_lvlname(size_t addr)
{
	const char* names[] = {"(none)", "PGT", "PGD", "PDPT", "PML4"};
	return names[map_to_level(addr)];
}

static inline size_t map_to_virt(size_t addr)
{
	return canonicalize(addr << (map_to_level(addr) * PAGE_MAP_SHIFT));
}

/*
 * Copy page maps using recursion
 *
 * @param src pointer to virtual address of source page tables
 * @param dest pointer to virtual address of destination page tables
 * @param copy flags what should be copied (see #define COPY_*)
 * @return number of new allocated page frames (for tables only)
 */
static int copy_page_map(page_map_t *src, page_map_t *dest, int copy)
{
	page_map_t* next_src, * next_dest;
	int ret = 0;
	uint32_t i;

	for(i=0; i<PAGE_MAP_ENTRIES; i++) {
		if (!(src->entries[i] & PG_PRESENT))
			// skip empty entries
			dest->entries[i] = 0;
		else if (src->entries[i] & PG_USER) {
			size_t phys;

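			/*
			 * User-accessible entries are duplicated below: lower-level
			 * tables are allocated anew and walked recursively, and page
			 * frames are copied if requested, so the new address space
			 * gets its own user mappings. Kernel-only entries (the final
			 * else branch) are simply shared between both address spaces.
			 */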
0x%012lx) -> %p\n", &src->entries[i], map_to_lvlname((size_t) &src->entries[i]), map_to_virt((size_t) &src->entries[i]), &dest->entries[i]); // deep copy user tables if ((size_t) src >= PAGE_PGT) { phys = get_page(); if (BUILTIN_EXPECT(!phys, 0)) return -ENOMEM; dest->entries[i] = phys|(src->entries[i] & ~PAGE_MASK); // reuse pointers to next lower page map tables next_src = (page_map_t*) ((size_t) &src->entries[i] << 9); next_dest = (page_map_t*) ((size_t) &dest->entries[i] << 9); ret += 1 + copy_page_map(next_src, next_dest, copy); } // deep copy page frame else { if (copy) { phys = copy_page_frame((size_t*) src->entries[i]); dest->entries[i] = phys|(src->entries[i] & ~PAGE_MASK); } kprintf("c: %p (%lx)\n", &src->entries[i], src->entries[i]); } } // shallow copy kernel only tables else { kprintf("s:%p (%s: 0x%012lx) -> %p\n", &src->entries[i], map_to_lvlname((size_t) &src->entries[i]), map_to_virt((size_t) &src->entries[i]), &dest->entries[i]); dest->entries[i] = src->entries[i]; } } kputs("r\n"); return ret; } int create_page_map(task_t* task, int copy) { size_t phys; uint32_t ret; // fixed mapping for paging structures page_map_t *current = (page_map_t*) PAGE_PML4; page_map_t *new = (page_map_t*) (PAGE_PML4 - 0x1000); // get new pml4 table phys = get_page(); if (!phys) return -ENOMEM; current->entries[PAGE_MAP_ENTRIES-2] = phys|KERN_TABLE; new->entries[PAGE_MAP_ENTRIES-1] = phys|KERN_TABLE; tlb_flush(); // ouch :( spinlock_lock(&kslock); ret = copy_page_map(current, new, copy); spinlock_unlock(&kslock); new->entries[PAGE_MAP_ENTRIES-1] = phys|KERN_TABLE; current->entries[PAGE_MAP_ENTRIES-2] = 0; task->page_map = (page_map_t*) phys; kprintf("create_page_map: allocated %u page tables\n", ret); return ret; } int drop_page_map(void) { #if 1 kprintf("TODO: test drop_page_map()\n"); return -EINVAL; // TODO #else task_t* task = per_core(current_task); page_map_t* pml4, * pdpt, * pgd, * pgt; size_t phys; uint32_t i, j, k, l; pml4 = task->page_map; if (BUILTIN_EXPECT(pml4 == &boot_pml4, 0)) return -EINVAL; spinlock_lock(&task->page_lock); // delete all user pages and tables for(i=0; ientries[i] & PG_USER) { for(j=0; jentries[j] & PG_USER) { for(k=0; kentries[k] & PG_USER) { for(l=0; lentries[l] & PG_USER) put_page(pgt->entries[l] & PAGE_MASK); } // TODO: put pgt } } // TODO: put pgd } } // TODO: put pdpt } } put_page(virt_to_phys((size_t) pml4)); task->page_map = NULL; spinlock_unlock(&task->page_lock); return 0; #endif } size_t virt_to_phys(size_t viraddr) { task_t* task = per_core(current_task); size_t phyaddr; size_t* pte; spinlock_irqsave_lock(&task->page_lock); pte = (size_t *) (PAGE_PGT | (viraddr >> 9)); phyaddr = (*pte & PAGE_MASK) | (viraddr & ~PAGE_MASK); spinlock_irqsave_unlock(&task->page_lock); return phyaddr; } size_t map_region(size_t viraddr, size_t phyaddr, uint32_t npages, uint32_t flags) { task_t* task = per_core(current_task); size_t i, ret; if (BUILTIN_EXPECT(!task || !task->page_map, 0)) return 0; if (!viraddr) { kputs("map_region: deprecated vma_alloc() call from within map_region\n"); viraddr = vma_alloc(npages*PAGE_SIZE, VMA_HEAP); if (BUILTIN_EXPECT(!viraddr, 0)) { kputs("map_region: found no valid virtual address\n"); ret = 0; goto out; } } // correct alignment phyaddr &= PAGE_MASK; viraddr &= PAGE_MASK; ret = viraddr; if (flags & MAP_KERNEL_SPACE) spinlock_lock(&kslock); else spinlock_irqsave_lock(&task->page_lock); kprintf("map_region: map %u pages from 0x%lx to 0x%lx with flags: 0x%x\n", npages, viraddr, phyaddr, flags); for(i=0; i> 9)); if (*pte && !(flags & 
		size_t* pte = (size_t *) (PAGE_PGT | (viraddr >> 9));

		if (*pte && !(flags & MAP_REMAP)) {
			kprintf("map_region: 0x%lx is already mapped\n", viraddr);
			ret = 0;
			goto out;
		}

		if (flags & MAP_USER_SPACE)
			*pte = phyaddr|USER_PAGE;
		else
			*pte = phyaddr|KERN_PAGE;

		if (flags & MAP_NO_CACHE)
			*pte |= PG_PCD;

		if (flags & MAP_NO_ACCESS)
			*pte &= ~PG_PRESENT;

		if (flags & MAP_WT)
			*pte |= PG_PWT;

		if (flags & MAP_USER_SPACE)
			atomic_int32_inc(&task->user_usage);

		tlb_flush_one_page(viraddr);
	}

out:
	if (flags & MAP_KERNEL_SPACE)
		spinlock_unlock(&kslock);
	else
		spinlock_irqsave_unlock(&task->page_lock);

	return ret;
}

int change_page_permissions(size_t start, size_t end, uint32_t flags)
{
#if 0
	uint32_t index1, index2, newflags;
	size_t viraddr = start & PAGE_MASK;
	size_t phyaddr;
	page_map_t* pgt;
	page_map_t* pgd;
	task_t* task = per_core(current_task);

	pgd = per_core(current_task)->page_map;
	if (BUILTIN_EXPECT(!pgd, 0))
		return -EINVAL;

	spinlock_lock(&task->page_lock);

	while (viraddr < end)
	{
		index1 = viraddr >> 22;
		index2 = (viraddr >> 12) & 0x3FF;

		while ((viraddr < end) && (index2 < 1024)) {
			pgt = (page_map_t*) ((KERNEL_SPACE - 1024*PAGE_SIZE + index1*PAGE_SIZE) & PAGE_MASK);
			if (pgt && pgt->entries[index2]) {
				phyaddr = pgt->entries[index2] & PAGE_MASK;
				newflags = pgt->entries[index2] & 0xFFF; // get old flags

				if (!(newflags & PG_SVM_INIT)) {
					if ((newflags & PG_SVM_STRONG) && !(newflags & PG_PRESENT) && (flags & (VMA_READ|VMA_WRITE)) && !(flags & VMA_NOACCESS))
						newflags |= PG_PRESENT;
					else if ((newflags & PG_SVM_STRONG) && (newflags & PG_PRESENT) && (flags & VMA_NOACCESS))
						newflags &= ~PG_PRESENT;
				}

				// update flags
				if (!(flags & VMA_WRITE)) {
					newflags &= ~PG_RW;
				} else {
					newflags |= PG_RW;
				}

				pgt->entries[index2] = (newflags & 0xFFF) | (phyaddr & PAGE_MASK);

				tlb_flush_one_page(viraddr);
			}
			index2++;
			viraddr += PAGE_SIZE;
		}
	}

	spinlock_unlock(&task->page_lock);
#endif

	return -EINVAL;
}

int unmap_region(size_t viraddr, uint32_t npages)
{
	task_t* task = per_core(current_task);
	page_map_t* pdpt, * pgd, * pgt;
	size_t i;
	uint16_t index_pml4, index_pdpt;
	uint16_t index_pgd, index_pgt;

	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return -EINVAL;

	if (viraddr <= KERNEL_SPACE)
		spinlock_lock(&kslock);
	else
		spinlock_irqsave_lock(&task->page_lock);

	i = 0;
	while(i < npages) {
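		/*
		 * Decode the 48-bit virtual address into its four 9-bit table
		 * indices; the remaining low 12 bits are the offset into the page.
		 */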
		index_pml4 = (viraddr >> 39) & 0x1FF;
		index_pdpt = (viraddr >> 30) & 0x1FF;
		index_pgd  = (viraddr >> 21) & 0x1FF;
		index_pgt  = (viraddr >> 12) & 0x1FF;

		// currently, we allocate pages only in kernel space.
		// => the physical address of a page table is identical to its virtual address
		pdpt = (page_map_t*) (task->page_map->entries[index_pml4] & PAGE_MASK);
		if (!pdpt) {
			// skip the whole range covered by this PML4 entry
			viraddr += (size_t) PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES*PAGE_SIZE;
			i += PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES;
			continue;
		}

		pgd = (page_map_t*) (pdpt->entries[index_pdpt] & PAGE_MASK);
		if (!pgd) {
			viraddr += PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES*PAGE_SIZE;
			i += PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES;
			continue;
		}

		pgt = (page_map_t*) (pgd->entries[index_pgd] & PAGE_MASK);
		if (!pgt) {
			viraddr += PAGE_MAP_ENTRIES*PAGE_SIZE;
			i += PAGE_MAP_ENTRIES;
			continue;
		}

		if (pgt->entries[index_pgt])
			pgt->entries[index_pgt] &= ~PG_PRESENT;

		if (viraddr > KERNEL_SPACE)
			atomic_int32_dec(&task->user_usage);

		tlb_flush_one_page(viraddr);

		viraddr += PAGE_SIZE;
		i++;
	}

	if (viraddr <= KERNEL_SPACE)
		spinlock_unlock(&kslock);
	else
		spinlock_irqsave_unlock(&task->page_lock);

	return 0;
}

static void pagefault_handler(struct state *s)
{
	task_t* task = per_core(current_task);
	size_t viraddr = read_cr2();
	size_t phyaddr;

#if 0
	if ((viraddr >= task->start_heap) && (viraddr <= task->end_heap) && (viraddr > KERNEL_SPACE)) {
		viraddr = viraddr & PAGE_MASK;

		phyaddr = get_page();
		if (BUILTIN_EXPECT(!phyaddr, 0))
			goto oom;

		if (map_region(viraddr, phyaddr, 1, MAP_USER_SPACE) == viraddr) {
			memset((void*) viraddr, 0x00, PAGE_SIZE);
			return;
		}

		kprintf("Could not map 0x%lx at 0x%lx\n", phyaddr, viraddr);
		put_page(phyaddr);
	}
	/*
	 * handle missing paging structures for userspace
	 * all kernel space paging structures have been initialized in entry64.asm
	 */
	else if (viraddr >= PAGE_PGT) {
		kprintf("pagefault_handler: missing paging structure at: 0x%lx (%s)\n", viraddr, map_to_lvlname(viraddr));

		phyaddr = get_page();
		if (BUILTIN_EXPECT(!phyaddr, 0))
			goto oom;

		// TODO: initialize with zeros
		// TODO: check that we are in userspace

		// get pointer to the parent page level entry
		size_t *entry = (size_t *) (((int64_t) viraddr >> 9) & ~0x07);

		// update entry
		*entry = phyaddr|USER_TABLE;

		return;
	}
#endif

	kprintf("PAGE FAULT: Task %u got page fault at %p (irq %llu, cs:rip 0x%llx:0x%llx)\n", task->id, viraddr, s->int_no, s->cs, s->rip);
	kprintf("Register state: rax = 0x%llx, rbx = 0x%llx, rcx = 0x%llx, rdx = 0x%llx, rdi = 0x%llx, rsi = 0x%llx, rbp = 0x%llx, rsp = 0x%llx\n",
		s->rax, s->rbx, s->rcx, s->rdx, s->rdi, s->rsi, s->rbp, s->rsp);

	irq_enable();
	abort();

oom:
	kputs("pagefault_handler: out of memory\n");
	irq_enable();
	abort();
}

int arch_paging_init(void)
{
	uint32_t i, npages;

	// replace the default pagefault handler
	irq_uninstall_handler(14);
	irq_install_handler(14, pagefault_handler);

	/*
	 * In long mode the kernel is already mapped into the kernel space (see entry64.asm).
	 * This includes .data, .bss, .text, VGA, the multiboot & multiprocessing (APIC) structures.
	 */

#if MAX_CORES > 1
	// reserve page for smp boot code
	if (!map_region(SMP_SETUP_ADDR, SMP_SETUP_ADDR, 1, MAP_KERNEL_SPACE|MAP_NO_CACHE)) {
		kputs("could not reserve page for smp boot code\n");
		return -ENOMEM;
	}
#endif

#ifdef CONFIG_MULTIBOOT
#if 0
	// map reserved memory regions into the kernel space
	if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MEM_MAP)) {
		multiboot_memory_map_t* mmap = (multiboot_memory_map_t*) mb_info->mmap_addr;
		multiboot_memory_map_t* mmap_end = (void*) ((size_t) mb_info->mmap_addr + mb_info->mmap_length);

		while (mmap < mmap_end) {
			if (mmap->type != MULTIBOOT_MEMORY_AVAILABLE) {
				npages = mmap->len / PAGE_SIZE;
				if ((mmap->addr+mmap->len) % PAGE_SIZE)
					npages++;
				map_region(mmap->addr, mmap->addr, npages, MAP_KERNEL_SPACE|MAP_NO_CACHE);
			}
			mmap++;
		}
	}
#endif

	/*
	 * Modules like the init ram disk are already loaded.
	 * Therefore, we map these modules into the kernel space.
	 */
	if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MODS)) {
		multiboot_module_t* mmodule = (multiboot_module_t*) ((size_t) mb_info->mods_addr);

		npages = mb_info->mods_count * sizeof(multiboot_module_t) >> PAGE_SHIFT;
		if (mb_info->mods_count * sizeof(multiboot_module_t) & (PAGE_SIZE-1))
			npages++;
		map_region((size_t) (mb_info->mods_addr), (size_t) (mb_info->mods_addr), npages, MAP_REMAP|MAP_KERNEL_SPACE);

		for(i=0; i<mb_info->mods_count; i++, mmodule++) {
			// map physical address to the same virtual address
			npages = (mmodule->mod_end - mmodule->mod_start) >> PAGE_SHIFT;
			if (mmodule->mod_end & (PAGE_SIZE-1))
				npages++;
			kprintf("Map module %s at 0x%x (%u pages)\n", (char*) mmodule->cmdline, mmodule->mod_start, npages);
			map_region((size_t) (mmodule->mod_start), (size_t) (mmodule->mod_start), npages, MAP_REMAP|MAP_KERNEL_SPACE);
		}
	}
#endif

	// we turned on paging => now, we are able to register our task
	register_task();

	// map the APIC registers into the kernel address space
	map_apic();

	return 0;
}