/* 
 * Copyright 2010 Stefan Lankes, Chair for Operating Systems,
 *                               RWTH Aachen University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This file is part of MetalSVM. 
 */

/**
 * @author Stefan Lankes
 * @file kernel/tasks.c
 * @brief Implementation of task loading, killing and scheduling
 *
 * This file contains the implementation of the functions used to create,
 * start, wake up and schedule tasks.
 */

#include <metalsvm/stdio.h>
#include <metalsvm/stdlib.h>
#include <metalsvm/string.h>
#include <metalsvm/errno.h>
#include <metalsvm/mmu.h>
#include <metalsvm/page.h>
#include <metalsvm/tasks.h>
#include <metalsvm/processor.h>
#include <metalsvm/spinlock.h>
#include <metalsvm/mailbox.h>
#include <metalsvm/syscall.h>
#include <metalsvm/fs.h>
#include <metalsvm/time.h>
#include <asm/apic.h>
#include <asm/elf.h>

/** @brief Array of task structures 
 *
 * A task's id will be its position in this array.
 */
static task_t task_table[MAX_TASKS] = { \
		[0]                 = {0, TASK_IDLE,	0, 0, 0, NULL, NULL, 0, ATOMIC_INIT(0), SPINLOCK_INIT, NULL, SPINLOCK_INIT, NULL, NULL, 0, 0, 0, 0}, \
		[1 ... MAX_TASKS-1] = {0, TASK_INVALID, 0, 0, 0, NULL, NULL, 0, ATOMIC_INIT(0), SPINLOCK_INIT, NULL, SPINLOCK_INIT, NULL, NULL, 0, 0, 0, 0}};

static spinlock_irqsave_t table_lock = SPINLOCK_IRQSAVE_INIT;
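/** @brief Runqueue of each core: per-priority ready queues, a timer queue and
 *         load statistics, protected by a per-core irqsave spinlock */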
#if MAX_CORES > 1
static runqueue_t runqueues[MAX_CORES] = { \
		[0]                 = {task_table+0, NULL, 0, {[0 ... 2] = 0}, TIMER_FREQ/5, TIMER_FREQ/2, 0, {[0 ... MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_IRQSAVE_INIT}, \
		[1 ... MAX_CORES-1] = {NULL,         NULL, 0, {[0 ... 2] = 0}, TIMER_FREQ/5, TIMER_FREQ/2, 0, {[0 ... MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_IRQSAVE_INIT}};
#else
static runqueue_t runqueues[1] = { \
		[0]                 = {task_table+0, NULL, 0, {[0 ... 2] = 0}, TIMER_FREQ/5, TIMER_FREQ/2, 0, {[0 ... MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_IRQSAVE_INIT}};
#endif

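/** @brief Per-core pointer to the task which is currently running on the respective core */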
DEFINE_PER_CORE(task_t*, current_task, task_table+0);

/** @brief helper function for the assembly code to determine the current task
 * @return Pointer to the task_t structure of current task
 */
task_t* get_current_task(void) {
	return per_core(current_task);
}

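/** @brief Switch to a task with a higher priority, if one is ready
 *
 * Does nothing while interrupts are disabled (e.g. inside a critical section).
 */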
void check_scheduling(void) {
	if (!is_irq_enabled())
		return;
	if (msb(runqueues[CORE_ID].prio_bitmap) > per_core(current_task)->prio)
		reschedule();
}

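/** @brief Determine the highest priority for which a ready task exists on this core
 * @return The highest set bit of the core's prio_bitmap
 */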
uint32_t get_highest_priority(void)
{
	return msb(runqueues[CORE_ID].prio_bitmap);
}

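/** @brief Initialize the multitasking subsystem
 *
 * Task 0 is the statically initialized idle task of the boot processor;
 * here it gets the boot page directory and its default flags.
 */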
int multitasking_init(void) {
	if (BUILTIN_EXPECT(task_table[0].status != TASK_IDLE, 0)) {
		kputs("Task 0 is not an idle task\n");
		return -ENOMEM;
	}

	mailbox_wait_msg_init(&task_table[0].inbox);
	memset(task_table[0].outbox, 0x00, sizeof(mailbox_wait_msg_t*)*MAX_TASKS);
	task_table[0].pgd = get_boot_pgd();
	task_table[0].flags = TASK_DEFAULT_FLAGS;
	task_table[0].prio = IDLE_PRIO;

	return 0;
}

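/** @brief Set up an idle task for an application processor
 *
 * Used during SMP boot: task slot @p id becomes the idle task of core @p id.
 *
 * @param id Id of the core and of the task slot to use
 * @return Stack of the new idle task (see get_stack()); -EINVAL if the id is
 *         invalid, the slot is already in use or MAX_CORES == 1
 */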
size_t get_idle_task(uint32_t id)
{
#if MAX_CORES > 1
	if (BUILTIN_EXPECT((id >= MAX_TASKS) || (task_table[id].status != TASK_INVALID), 0))
		return -EINVAL;

	task_table[id].id = id;
	task_table[id].status = TASK_IDLE;
	task_table[id].prio = IDLE_PRIO;
	task_table[id].flags = TASK_DEFAULT_FLAGS;
	task_table[id].last_core = id;
	atomic_int32_set(&task_table[id].user_usage, 0);
	mailbox_wait_msg_init(&task_table[id].inbox);
	memset(task_table[id].outbox, 0x00, sizeof(mailbox_wait_msg_t*)*MAX_TASKS);
	task_table[id].pgd = get_boot_pgd();
	current_task[id].var = task_table+id;
	runqueues[id].idle = task_table+id;

	return get_stack(id);
#else
	return -EINVAL;
#endif
}

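/** @brief Finalize a task switch on the side of the newly running task
 *
 * The task which ran before the switch (stored in runqueues[].old_task by the
 * scheduler) is appended to the tail of its priority queue.
 *
 * @param irq If nonzero, interrupts are enabled before returning
 */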
static void finish_task_switch(uint32_t irq)
{
	uint8_t prio;
	uint32_t core_id = CORE_ID;
	task_t* old;

	spinlock_irqsave_lock(&runqueues[core_id].lock);
	if ((old = runqueues[core_id].old_task) != NULL) {
		prio = old->prio;
		if (!runqueues[core_id].queue[prio-1].first) {
			old->next = old->prev = NULL;
			runqueues[core_id].queue[prio-1].first = runqueues[core_id].queue[prio-1].last = old;
		} else {
			old->next = NULL;
			old->prev = runqueues[core_id].queue[prio-1].last;
			runqueues[core_id].queue[prio-1].last->next = old;
			runqueues[core_id].queue[prio-1].last = old;
		}
		runqueues[core_id].old_task = NULL;
		runqueues[core_id].prio_bitmap |= (1 << prio);
	}
	spinlock_irqsave_unlock(&runqueues[core_id].lock);

	if (irq)
		irq_enable();
}

/** @brief Wakeup tasks which are waiting for a message from the current one
 *
 * @param result Current task's resulting return value 
 */
static void wakeup_blocked_tasks(int result)
{
	task_t* curr_task = per_core(current_task);
	wait_msg_t tmp = { curr_task->id, result };
	unsigned int i;

	spinlock_irqsave_lock(&table_lock);

	/* wake up blocked tasks */
	for(i=0; i<MAX_TASKS; i++) {
		if (curr_task->outbox[i]) {
			//kprintf("Wake up blocked task %d\n", i);
			mailbox_wait_msg_post(curr_task->outbox[i], tmp);
			curr_task->outbox[i] = NULL;
		}
	}

	spinlock_irqsave_unlock(&table_lock);
}

/** @brief Common exit routine called by all procedures which terminate a task */
static void NORETURN do_exit(int arg) {
	vma_t* tmp;
	task_t* curr_task = per_core(current_task);
	uint32_t flags, core_id;

	kprintf("Terminate task: %u, return value %d\n", curr_task->id, arg);

	wakeup_blocked_tasks(arg);

	//vma_dump(curr_task);
	spinlock_lock(&curr_task->vma_lock);

	// remove memory regions
	while((tmp = curr_task->vma_list) != NULL) {
		kfree((void*) tmp->start, tmp->end - tmp->start + 1);
		curr_task->vma_list = tmp->next;
		kfree((void*) tmp, sizeof(vma_t));
	}

	// remove the file descriptor table
	if (curr_task->fildes_table)
		kfree(curr_task->fildes_table, sizeof(filp_t)*NR_OPEN);

	spinlock_unlock(&curr_task->vma_lock);

	drop_pgd(); // delete page directory and its page tables
	
	if (atomic_int32_read(&curr_task->user_usage))
		kprintf("Memory leak! Task %d did not release %d pages\n", 
				curr_task->id, atomic_int32_read(&curr_task->user_usage));
	curr_task->status = TASK_FINISHED;

	// decrease the number of active tasks
	flags = irq_nested_disable();
	core_id = CORE_ID;
	spinlock_irqsave_lock(&runqueues[core_id].lock);
	runqueues[core_id].nr_tasks--;
	spinlock_irqsave_unlock(&runqueues[core_id].lock);
	irq_nested_enable(flags);

	reschedule();
	
	kprintf("Kernel panic: scheduler on core %d found no valid task\n", CORE_ID);
	while(1) {
		HALT;
	}
}

/** @brief A procedure to be called by kernel tasks */
void NORETURN leave_kernel_task(void) {
	int result;

	result = get_return_value();
	do_exit(result);
}

/** @brief Called by the exit system call to terminate the current task */
void NORETURN sys_exit(int arg) {
	do_exit(arg);
}

/** @brief Aborting a task is like exiting it with result -1 */
void NORETURN abort(void) {
	do_exit(-1);
}

/** @brief Create a task with a specific entry point
 *
 * @param id Pointer to a tid_t variable where the new task's id shall be stored
 * @param ep Pointer to the entry point the task shall start with
 * @param arg Argument passed to the entry point
 * @param prio Priority of the new task
 * @return
 * - 0 on success
 * - -ENOMEM (-12) or -EINVAL (-22) on failure
 */
static int create_task(tid_t* id, internal_entry_point_t ep, void* arg, uint8_t prio)
{
	task_t* curr_task;
	int ret = -ENOMEM;
	unsigned int i, core_id;

	if (BUILTIN_EXPECT(!ep, 0))
		return -EINVAL;
	if (BUILTIN_EXPECT(prio == IDLE_PRIO, 0))
		return -EINVAL;
	if (BUILTIN_EXPECT(prio > MAX_PRIO, 0))
		return -EINVAL;

	spinlock_irqsave_lock(&table_lock);

	core_id = CORE_ID;
	curr_task = per_core(current_task);

	for(i=0; i<MAX_TASKS; i++) {
		if (task_table[i].status == TASK_INVALID) {
			atomic_int32_set(&task_table[i].user_usage, 0);

			ret = create_pgd(task_table+i, 0);		
			if (ret < 0) {
				ret = -ENOMEM;
				goto create_task_out;
			}

			task_table[i].id = i;
			task_table[i].status = TASK_READY;
			task_table[i].flags = TASK_DEFAULT_FLAGS;
			task_table[i].prio = prio;
			task_table[i].last_core = 0;
			spinlock_init(&task_table[i].vma_lock);
			task_table[i].vma_list = NULL;
			task_table[i].fildes_table = NULL;
			mailbox_wait_msg_init(&task_table[i].inbox);
			memset(task_table[i].outbox, 0x00, sizeof(mailbox_wait_msg_t*)*MAX_TASKS);
			task_table[i].outbox[curr_task->id] = &curr_task->inbox; 

			if (id)
				*id = i;	

			ret = create_default_frame(task_table+i, ep, arg);

			task_table[i].start_heap = 0;
			task_table[i].end_heap = 0;
			task_table[i].lwip_err = 0;
			task_table[i].start_tick = get_clock_tick();

			// add task in the runqueue
			spinlock_irqsave_lock(&runqueues[core_id].lock);
			runqueues[core_id].prio_bitmap |= (1 << prio);
			runqueues[core_id].nr_tasks++;
			if (!runqueues[core_id].queue[prio-1].first) {
				task_table[i].next = task_table[i].prev = NULL;
				runqueues[core_id].queue[prio-1].first = task_table+i;
				runqueues[core_id].queue[prio-1].last = task_table+i;
			} else {
				task_table[i].prev = runqueues[core_id].queue[prio-1].last;
				task_table[i].next = NULL;
				runqueues[core_id].queue[prio-1].last->next = task_table+i;
				runqueues[core_id].queue[prio-1].last = task_table+i;
			}
			spinlock_irqsave_unlock(&runqueues[core_id].lock);
			break;
		}
	}

create_task_out:
	spinlock_irqsave_unlock(&table_lock);

	return ret;
}

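/** @brief Fork the current task
 *
 * Creates a copy of the parent's page directory, duplicates its VMA list and
 * file descriptor table and copies the FPU state.
 *
 * @return
 * - id of the new task in the parent, 0 in the child
 * - -ENOMEM or another negative error code on failure
 */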
int sys_fork(void)
{
	int ret = -ENOMEM;
	unsigned int i, core_id, fd_i;
	task_t* parent_task = per_core(current_task);
	vma_t** child;
	vma_t* parent;
	vma_t* tmp;

	spinlock_lock(&parent_task->vma_lock);
	spinlock_irqsave_lock(&table_lock);

	core_id = CORE_ID;

	for(i=0; i<MAX_TASKS; i++) {
		if (task_table[i].status == TASK_INVALID) {
			atomic_int32_set(&task_table[i].user_usage, 0);

			ret = create_pgd(task_table+i, 1);		
			if (ret < 0) {
				ret = -ENOMEM;
				goto create_task_out;
			}

			task_table[i].id = i;
			spinlock_init(&task_table[i].vma_lock);

			// copy VMA list
			child = &task_table[i].vma_list;
			parent = parent_task->vma_list;
			tmp = NULL;

			while(parent) {
				*child = (vma_t*) kmalloc(sizeof(vma_t));
				if (BUILTIN_EXPECT(!*child, 0))
					break;

				(*child)->start = parent->start;
				(*child)->end = parent->end;
				(*child)->type = parent->type;
				(*child)->prev = tmp;
				(*child)->next = NULL;

				parent = parent->next;
				tmp = *child;
				child = &((*child)->next);
			}
			

			/* duplicate the parent's file descriptor table */
			task_table[i].fildes_table = kmalloc(sizeof(filp_t)*NR_OPEN);
			if (BUILTIN_EXPECT(!task_table[i].fildes_table, 0)) {
				ret = -ENOMEM;
				goto create_task_out;
			}
			memcpy(task_table[i].fildes_table, parent_task->fildes_table, sizeof(filp_t)*NR_OPEN);
			for (fd_i = 0; fd_i < NR_OPEN; fd_i++)
				if ((task_table[i].fildes_table[fd_i]) != NULL)
					task_table[i].fildes_table[fd_i]->count++;

			mailbox_wait_msg_init(&task_table[i].inbox);
			memset(task_table[i].outbox, 0x00, sizeof(mailbox_wait_msg_t*)*MAX_TASKS);
			task_table[i].outbox[parent_task->id] = &parent_task->inbox; 
			task_table[i].flags = parent_task->flags;
			memcpy(&(task_table[i].fpu), &(parent_task->fpu), sizeof(union fpu_state));
			task_table[i].start_tick = get_clock_tick();
			task_table[i].start_heap = 0;
			task_table[i].end_heap = 0;
			task_table[i].lwip_err = 0;
			task_table[i].prio = parent_task->prio;
			task_table[i].last_core = parent_task->last_core;

			// add task in the runqueue
			spinlock_irqsave_lock(&runqueues[core_id].lock);
			runqueues[core_id].prio_bitmap |= (1 << parent_task->prio);
			runqueues[core_id].nr_tasks++;
			if (!runqueues[core_id].queue[parent_task->prio-1].first) {
				task_table[i].next = task_table[i].prev = NULL;
				runqueues[core_id].queue[parent_task->prio-1].first = task_table+i;
				runqueues[core_id].queue[parent_task->prio-1].last = task_table+i;
			} else {
				task_table[i].prev = runqueues[core_id].queue[parent_task->prio-1].last;
				task_table[i].next = NULL;
				runqueues[core_id].queue[parent_task->prio-1].last->next = task_table+i;
				runqueues[core_id].queue[parent_task->prio-1].last = task_table+i;
			}
			spinlock_irqsave_unlock(&runqueues[core_id].lock);

			ret = arch_fork(task_table+i);

			if (parent_task != per_core(current_task)) {
				// Oh, the current task is the new child task!
				// Leave the function without releasing the locks
				// because the locks are already released 
				// by the parent task!
				finish_task_switch(1);
				return 0; 
			}

			if (!ret) {
				task_table[i].status = TASK_READY;
				ret = i;
			}
			break;
		}
	}

create_task_out:
	spinlock_irqsave_unlock(&table_lock);
	spinlock_unlock(&parent_task->vma_lock);

	return ret;
}

/** @brief Structure which keeps all
 * relevant data for a new kernel task to start */
typedef struct {
	/// entry point of the kernel task
	entry_point_t func;
	/// arguments
	void* args;
} kernel_args_t;

/** @brief Wrapper entry point which unpacks the kernel_args_t passed by
 * create_kernel_task() and calls the actual start function with its argument */
static int STDCALL kernel_entry(void* args)
{
	int ret;
	kernel_args_t* kernel_args = (kernel_args_t*) args;

	finish_task_switch(1);

	if (BUILTIN_EXPECT(!kernel_args, 0))
		return -EINVAL;

	ret = kernel_args->func(kernel_args->args);

	kfree(kernel_args, sizeof(kernel_args_t));

	return ret;
}

int create_kernel_task(tid_t* id, entry_point_t ep, void* args, uint8_t prio)
{
	kernel_args_t* kernel_args;

	kernel_args = kmalloc(sizeof(kernel_args_t));
	if (BUILTIN_EXPECT(!kernel_args, 0))
		return -ENOMEM;

	kernel_args->func = ep;
	kernel_args->args = args;

	if (prio > MAX_PRIO)
		prio = NORMAL_PRIO;

	return create_task(id, kernel_entry, kernel_args, prio);
}
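
/*
 * Minimal usage sketch (foo and its argument are hypothetical):
 *
 *	static int foo(void* arg) { kputs("Hello from a kernel task\n"); return 0; }
 *
 *	tid_t id;
 *	create_kernel_task(&id, foo, NULL, NORMAL_PRIO);
 */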

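/* sized so that a load_args_t (node, argc, envc, buffer) fits into a single page */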
#define MAX_ARGS	(PAGE_SIZE - 2*sizeof(int) - sizeof(vfs_node_t*))

/** @brief Structure which keeps all 
 * relevant data for a new user task to start */
typedef struct {
	/// Points to the node with the executable in the file system
	vfs_node_t* node;
	/// Argument count
	int argc;
	/// Environment var count
	int envc;
	/// Buffer for env and argv values
	char buffer[MAX_ARGS];
} load_args_t;

/** @brief Internally used function to load a task from a load_args_t structure
 * which keeps all the information needed to launch it.
 *
 * This function parses the ELF header, maps the program segments, builds the
 * user-level stack and finally jumps to the entry point of the executable.
 */
static int load_task(load_args_t* largs)
{
	uint32_t i, offset, idx, fd_i;
	uint32_t addr, npages, flags, stack = 0;
	elf_header_t header;
	elf_program_header_t prog_header;
	//elf_section_header_t sec_header;
	//TODO: kfree of file is missing; init the whole fildes_t struct!
	fildes_t *file = kmalloc(sizeof(fildes_t));
	task_t* curr_task = per_core(current_task);
	int err;

	if (BUILTIN_EXPECT(!file, 0))
		return -ENOMEM;
	file->offset = 0;
	file->flags = 0;

	if (!largs)
		return -EINVAL;

	file->node = largs->node;
	if (!file->node)
		return -EINVAL;

	/* init fildes_table */
	spinlock_irqsave_lock(&table_lock);
	if (!task_table[curr_task->id].fildes_table) {
		task_table[curr_task->id].fildes_table = kmalloc(sizeof(filp_t)*NR_OPEN);
		if (BUILTIN_EXPECT(!task_table[curr_task->id].fildes_table, 0)) {
			spinlock_irqsave_unlock(&table_lock);
			return -ENOMEM;
		}
		memset(task_table[curr_task->id].fildes_table, 0x00, sizeof(filp_t)*NR_OPEN);
		for (fd_i = 0; fd_i < 3; fd_i++)
			task_table[curr_task->id].fildes_table[fd_i] = kmalloc(sizeof(fildes_t));
		task_table[curr_task->id].fildes_table[0]->node = findnode_fs("/dev/stdin");
		task_table[curr_task->id].fildes_table[1]->node = findnode_fs("/dev/stdout");
		task_table[curr_task->id].fildes_table[2]->node = findnode_fs("/dev/stderr");
		for (fd_i = 0; fd_i < 3; fd_i++)
			task_table[curr_task->id].fildes_table[fd_i]->count = 1;
	}
	spinlock_irqsave_unlock(&table_lock);

	err = read_fs(file, (uint8_t*)&header, sizeof(elf_header_t));
	if (err < 0) {
		kprintf("read_fs failed: %d\n", err);
		return err;
	}
	if (BUILTIN_EXPECT(header.ident.magic != ELF_MAGIC, 0))
		goto invalid;

	if (BUILTIN_EXPECT(header.type != ELF_ET_EXEC, 0))
		goto invalid;

	if (BUILTIN_EXPECT(header.machine != ELF_EM_386, 0))
		goto invalid;

	if (BUILTIN_EXPECT(header.ident._class != ELF_CLASS_32, 0))
		goto invalid;

	if (BUILTIN_EXPECT(header.ident.data != ELF_DATA_2LSB, 0))
		goto invalid;

	if (header.entry <= KERNEL_SPACE)
		goto invalid;

	// interpret program header table
	for (i=0; i<header.ph_entry_count; i++) {
		file->offset = header.ph_offset+i*header.ph_entry_size;
		if (read_fs(file, (uint8_t*)&prog_header, sizeof(elf_program_header_t)) == 0) {
			kprintf("Could not read programm header!\n");
			continue;
		}

		switch(prog_header.type)
		{
		case  ELF_PT_LOAD:  // load program segment
			if (!prog_header.virt_addr)
				continue;

			npages = (prog_header.mem_size >> PAGE_SHIFT);
			if (prog_header.mem_size & (PAGE_SIZE-1))
				npages++;

			addr = get_pages(npages);

			flags = MAP_USER_SPACE;
			if (prog_header.flags & PF_X)
				flags |= MAP_CODE;

			// map page frames in the address space of the current task
			if (!map_region(prog_header.virt_addr, addr, npages, flags))
				kprintf("Could not map 0x%x at 0x%x\n", addr, prog_header.virt_addr);

			// clear pages
			memset((void*) prog_header.virt_addr, 0x00, npages*PAGE_SIZE);

			// set starting point of the heap
			if (curr_task->start_heap < prog_header.virt_addr+prog_header.mem_size)
				curr_task->start_heap = curr_task->end_heap = prog_header.virt_addr+prog_header.mem_size;

			// load program
			file->offset = prog_header.offset;
			read_fs(file, (uint8_t*)prog_header.virt_addr, prog_header.file_size);

			flags = VMA_CACHEABLE;
			if (prog_header.flags & PF_R)
				flags |= VMA_READ;
			if (prog_header.flags & PF_W)
				flags |= VMA_WRITE;
			if (prog_header.flags & PF_X)
				flags |= VMA_EXECUTE;
			vma_add(curr_task, prog_header.virt_addr, prog_header.virt_addr+npages*PAGE_SIZE-1, flags);

			if (!(prog_header.flags & PF_W)) 
				change_page_permissions(prog_header.virt_addr, prog_header.virt_addr+npages*PAGE_SIZE-1, flags);
			break;

		case ELF_PT_GNU_STACK: // Indicates stack executability
			// create user-level stack
			npages = DEFAULT_STACK_SIZE >> PAGE_SHIFT;
			if (DEFAULT_STACK_SIZE & (PAGE_SIZE-1))
				npages++;

			addr = get_pages(npages); 
			stack = header.entry*2; // virtual address of the stack

			if (!map_region(stack, addr, npages, MAP_USER_SPACE)) {
				kprintf("Could not map stack at 0x%x\n", stack);
				return -ENOMEM;
			}
			memset((void*) stack, 0x00, npages*PAGE_SIZE);

			// create vma regions for the user-level stack
			flags = VMA_CACHEABLE;
			if (prog_header.flags & PF_R)
				flags |= VMA_READ;
			if (prog_header.flags & PF_W)
				flags |= VMA_WRITE;
			if (prog_header.flags & PF_X)
				flags |= VMA_EXECUTE;
			vma_add(curr_task, stack, stack+npages*PAGE_SIZE-1, flags);
			break;
		 }
	}

#if 0
	// interpret section header table
	for (i=0; i<header.sh_entry_count; i++) {
		file->offset = header.sh_offset+i*header.sh_entry_size;
		if (read_fs(file, (uint8_t*)&sec_header, sizeof(elf_section_header_t)) == 0) {
			kprintf("Could not read section header!\n");
			continue;
		}

		// TODO: interpret section header
	}
#endif

	if (BUILTIN_EXPECT(!stack, 0)) {
		kprintf("Stack is missing!\n");
		return -ENOMEM;
	}

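	/*
	 * Build the initial user stack. From higher to lower addresses:
	 *   - a terminating zero word
	 *   - the argument/environment strings (MAX_ARGS bytes copied from largs->buffer)
	 *   - the argv[] pointer array, followed by the env[] pointer array plus a NULL entry
	 *   - a pointer to env, a pointer to argv and finally argc
	 * The resulting stack pointer (stack+offset) points to argc.
	 */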
	// push strings on the stack
	offset = DEFAULT_STACK_SIZE-8;
	memset((void*) (stack+offset), 0, 4);
	offset -= MAX_ARGS;
	memcpy((void*) (stack+offset), largs->buffer, MAX_ARGS);
	idx = offset;

	// push argv on the stack
	offset -= largs->argc * sizeof(char*);
	for(i=0; i<largs->argc; i++) {
		((char**) (stack+offset))[i] = (char*) (stack+idx);
		
		while(((char*) stack)[idx] != '\0')
			idx++;
		idx++;
	}

	// push env on the stack
	offset -= (largs->envc+1) * sizeof(char*);
	for(i=0; i<largs->envc; i++) {
		((char**) (stack+offset))[i] = (char*) (stack+idx);

		while(((char*) stack)[idx] != '\0')
			idx++;
		idx++;
	}
	((char**) (stack+offset))[largs->envc] = NULL;

	// push pointer to env
	offset -= sizeof(char**);
	if (!(largs->envc))
		*((char***) (stack+offset)) = NULL;
	else
		*((char***) (stack+offset)) = (char**) (stack + offset + sizeof(char**));

	// push pointer to argv
	offset -= sizeof(char**);
	*((char***) (stack+offset)) = (char**) (stack + offset + 2*sizeof(char**) + (largs->envc+1) * sizeof(char*));

	// push argc on the stack
	offset -= sizeof(int);
	*((int*) (stack+offset)) = largs->argc;

	kfree(largs, sizeof(load_args_t));

	// clear fpu state
	curr_task->flags &= ~(TASK_FPU_USED|TASK_FPU_INIT);

	jump_to_user_code(header.entry, stack+offset);

	return 0;

invalid:
	kprintf("Invalid executable!\n");
	kprintf("magic number 0x%x\n", (uint32_t) header.ident.magic);
	kprintf("header type 0x%x\n", (uint32_t) header.type);
	kprintf("machine type 0x%x\n", (uint32_t) header.machine);
	kprintf("elf ident class 0x%x\n", (uint32_t) header.ident._class);
	kprintf("elf identdata !0x%x\n", header.ident.data);
	kprintf("program entry point 0x%x\n", (size_t) header.entry);

	return -EINVAL;
}

/** @brief Wrapper entry point which finishes the task switch and then loads
 * the user-level executable described by the load_args_t argument */
static int STDCALL user_entry(void* arg)
{
	int ret;

	finish_task_switch(1);

	if (BUILTIN_EXPECT(!arg, 0))
		return -EINVAL;

	ret = load_task((load_args_t*) arg);

	kfree(arg, sizeof(load_args_t));

	return ret;
}

/** @brief Convenience wrapper around create_task(): create a user task from an executable's path
 *
 * @param id Pointer to the tid_t variable which shall be filled with the new task's id
 * @param fname Executable's path and filename
 * @param argv Arguments list
 * @return
 * - 0 on success
 * - -ENOMEM (-12) or -EINVAL (-22) on failure
 */
int create_user_task(tid_t* id, const char* fname, char** argv)
{
	vfs_node_t* node;
	int argc = 0;
	size_t i, buffer_size = 0;
	load_args_t* load_args = NULL;
	char *dest, *src;

	node = findnode_fs((char*) fname);
	if (!node || !(node->type == FS_FILE))
		return -EINVAL;

	// determine buffer size of argv
	if (argv) {
		while (argv[argc]) {
			buffer_size += (strlen(argv[argc]) + 1);
			argc++;
		}
	}

	if (argc <= 0)
		return -EINVAL;
	if (buffer_size >= MAX_ARGS)
		return -EINVAL;

	load_args = kmalloc(sizeof(load_args_t));
	if (BUILTIN_EXPECT(!load_args, 0))
		return -ENOMEM;
	load_args->node = node;
	load_args->argc = argc;
	load_args->envc = 0;
	dest = load_args->buffer;
	for (i=0; i<argc; i++) {
		src = argv[i];
		while ((*dest++ = *src++) != 0);
	}


	/* create new task */
	return create_task(id, user_entry, load_args, NORMAL_PRIO);
}

/** @brief Implementation of the execve system call */
int sys_execve(const char* fname, char** argv, char** env)
{
	vfs_node_t* node;
	vma_t* tmp;
	size_t i, buffer_size = 0;
	load_args_t* load_args = NULL;
	char *dest, *src;
	int ret, argc = 0;
	int envc = 0;
	task_t* curr_task = per_core(current_task);

	node = findnode_fs((char*) fname);
	if (!node || !(node->type == FS_FILE))
		return -EINVAL;

	// determine total buffer size of argv and env
	if (argv) {
		while (argv[argc]) {
			buffer_size += (strlen(argv[argc]) + 1);
			argc++;
		}
	}

	if (env) {
		while (env[envc]) {
			buffer_size += (strlen(env[envc]) + 1);
			envc++;
		}
	}

	if (argc <= 0)
		return -EINVAL;
	if (buffer_size >= MAX_ARGS)
		return -EINVAL;

	load_args = kmalloc(sizeof(load_args_t));
	if (BUILTIN_EXPECT(!load_args, 0))
		return -ENOMEM;

	load_args->node = node;
	load_args->argc = argc;
	load_args->envc = envc;
	dest = load_args->buffer;
	for (i=0; i<argc; i++) {
		src = argv[i];
		while ((*dest++ = *src++) != 0);
	}
	for (i=0; i<envc; i++) {
		src = env[i];
		while ((*dest++ = *src++) != 0);
	}

	spinlock_lock(&curr_task->vma_lock);

	// remove old program
	while((tmp = curr_task->vma_list) != NULL) {
		kfree((void*) tmp->start, tmp->end - tmp->start + 1);
		curr_task->vma_list = tmp->next;
		kfree((void*) tmp, sizeof(vma_t));
	}

	spinlock_unlock(&curr_task->vma_lock);

	/*
	 * we use a trap gate to enter the kernel
	 * => eflags are not changed
	 * => interrupts are enabled
	 * => we could directly load the new task
	 */

	ret = load_task(load_args);

	kfree(load_args, sizeof(load_args_t));

	return ret;
}

/** @brief Wait for the termination of another task and fetch its return value
 *
 * @param result Pointer which shall be filled with the terminated task's return value
 * @return The id of the terminated task
 */
tid_t wait(int32_t* result) 
{
	task_t* curr_task = per_core(current_task);
	wait_msg_t tmp = { -1, -1};

	/*
	 * idle tasks are not allowed to wait for another task;
	 * they should always be runnable
	 */
	if (BUILTIN_EXPECT(curr_task->status == TASK_IDLE, 0))
		return -EINVAL;

	mailbox_wait_msg_fetch(&curr_task->inbox, &tmp, 0);

	if (result)
		*result = tmp.result;

	return tmp.id;
}

/** @brief Wakeup a blocked task
 * @param id The task's tid_t structure
 * @return
 * - 0 on success
 * - -EINVAL (-22) on failure
 */
int wakeup_task(tid_t id)
{
	task_t* task;
	uint32_t core_id, prio;
	uint32_t flags;
	int ret = -EINVAL;

	flags = irq_nested_disable();

	task = task_table + id;
	prio = task->prio;
	core_id = task->last_core;

	if (task->status == TASK_BLOCKED) {
		task->status = TASK_READY;
		ret = 0;

		spinlock_irqsave_lock(&runqueues[core_id].lock);
		// increase the number of ready tasks
		runqueues[core_id].nr_tasks++;

		// do we need to remove from timer queue?
		if (task->flags & TASK_TIMER) {
			task->flags &= ~TASK_TIMER;
			if (task->prev)
				task->prev->next = task->next;
			if (task->next)
				task->next->prev = task->prev;
			if (runqueues[core_id].timers.first == task)
				runqueues[core_id].timers.first = task->next;
			if (runqueues[core_id].timers.last == task)
				runqueues[core_id].timers.last = task->prev;
		}

		// add task to the runqueue
		if (!runqueues[core_id].queue[prio-1].last) {
			runqueues[core_id].queue[prio-1].last = runqueues[core_id].queue[prio-1].first = task;
			task->next = task->prev = NULL;
			runqueues[core_id].prio_bitmap |= (1 << prio);
		} else {
			task->prev = runqueues[core_id].queue[prio-1].last;
			task->next = NULL;
			runqueues[core_id].queue[prio-1].last->next = task;
			runqueues[core_id].queue[prio-1].last = task;
		}
		spinlock_irqsave_unlock(&runqueues[core_id].lock);
	}

	irq_nested_enable(flags);

	return ret;
}

/** @brief Block current task
 *
 * The current task's status will be changed to TASK_BLOCKED
 *
 * @return
 * - 0 on success
 * - -EINVAL (-22) on failure
 */
int block_current_task(void)
{
	task_t* curr_task;
	tid_t id;
	uint32_t core_id, prio;
	uint32_t flags;
	int ret = -EINVAL;

	flags = irq_nested_disable();

	curr_task = per_core(current_task);
	id = curr_task->id;
	prio = curr_task->prio;
	core_id = CORE_ID;

	if (task_table[id].status == TASK_RUNNING) {
		task_table[id].status = TASK_BLOCKED;
		ret = 0;

		spinlock_irqsave_lock(&runqueues[core_id].lock);
		// reduce the number of ready tasks
		runqueues[core_id].nr_tasks--;

		// remove task from queue
		if (task_table[id].prev)
			task_table[id].prev->next = task_table[id].next;
		if (task_table[id].next)
			task_table[id].next->prev = task_table[id].prev;
		if (runqueues[core_id].queue[prio-1].first == task_table+id)
			runqueues[core_id].queue[prio-1].first = task_table[id].next;
		if (runqueues[core_id].queue[prio-1].last == task_table+id) {
			runqueues[core_id].queue[prio-1].last = task_table[id].prev;
			if (!runqueues[core_id].queue[prio-1].last)
				runqueues[core_id].queue[prio-1].last = runqueues[core_id].queue[prio-1].first;
		}

		// No valid task in queue => update prio_bitmap
		if (!runqueues[core_id].queue[prio-1].first)
			runqueues[core_id].prio_bitmap &= ~(1 << prio);

		spinlock_irqsave_unlock(&runqueues[core_id].lock);
	}

	irq_nested_enable(flags);

	return ret;
}

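/** @brief Block the current task until a given clock tick is reached
 *
 * The task is removed from its runqueue and inserted into the core's timer
 * queue, which is kept sorted by ascending timeout. The scheduler wakes it up
 * once the deadline has passed.
 *
 * @param deadline Absolute clock tick at which the task shall be woken up
 * @return
 * - 0 on success
 * - -EINVAL if the current task is not in the TASK_RUNNING state
 */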
int set_timer(uint64_t deadline)
{
	task_t* curr_task;
	task_t* tmp;
	uint32_t core_id, prio;
	uint32_t flags;
	int ret = -EINVAL;

	flags = irq_nested_disable();

	curr_task = per_core(current_task);
	prio = curr_task->prio;
	core_id = CORE_ID;

	if (curr_task->status == TASK_RUNNING) {
		curr_task->status = TASK_BLOCKED;
		curr_task->timeout = deadline;
		curr_task->flags |= TASK_TIMER;
		ret = 0;

		spinlock_irqsave_lock(&runqueues[core_id].lock);

		// reduce the number of ready tasks
		runqueues[core_id].nr_tasks--;

		// remove task from queue
		if (curr_task->prev)
			curr_task->prev->next = curr_task->next;
		if (curr_task->next)
			curr_task->next->prev = curr_task->prev;
		if (runqueues[core_id].queue[prio-1].first == curr_task)
			runqueues[core_id].queue[prio-1].first = curr_task->next;
		if (runqueues[core_id].queue[prio-1].last == curr_task) {
			runqueues[core_id].queue[prio-1].last = curr_task->prev;
			if (!runqueues[core_id].queue[prio-1].last)
				runqueues[core_id].queue[prio-1].last = runqueues[core_id].queue[prio-1].first;
		}

		// No valid task in queue => update prio_bitmap
		if (!runqueues[core_id].queue[prio-1].first)
			runqueues[core_id].prio_bitmap &= ~(1 << prio);

		// add task to the timer queue
		tmp = runqueues[core_id].timers.first;
		if (!tmp) {
			runqueues[core_id].timers.first = runqueues[core_id].timers.last = curr_task;
			curr_task->prev = curr_task->next = NULL;
		} else {
			while(tmp && (deadline >= tmp->timeout))
				tmp = tmp->next;

			if (!tmp) {
				curr_task->next = NULL;
				curr_task->prev = runqueues[core_id].timers.last;
				if (runqueues[core_id].timers.last)
					runqueues[core_id].timers.last->next = curr_task;
				runqueues[core_id].timers.last = curr_task;
				// obsolete lines...
				//if (!runqueues[core_id].timers.first)
				//	runqueues[core_id].timers.first = curr_task;
			} else {
				curr_task->prev = tmp->prev;
				curr_task->next = tmp;
				tmp->prev = curr_task;
				if (curr_task->prev)
					curr_task->prev->next = curr_task;
				if (runqueues[core_id].timers.first == tmp)
					runqueues[core_id].timers.first = curr_task;
			}
		}

		spinlock_irqsave_unlock(&runqueues[core_id].lock);
	} else kprintf("Task is already blocked. No timer will be set!\n");

	irq_nested_enable(flags);

	return ret;
}

/* determine the load average as fixed-point values */
#define FSHIFT	11		/* nr of bits of precision */
#define FIXED_1	(1<<FSHIFT)	/* 1.0 as fixed-point      */
#define EXP_1	1884		/* 1/exp(5sec/1min)        */
#define EXP_5	2014		/* 1/exp(5sec/5min)        */
#define EXP_15	2037		/* 1/exp(5sec/15min)       */

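/** @brief Update the load average of the current core
 *
 * Every TIMER_FREQ/5 ticks the three values in load[] are updated as an
 * exponentially weighted moving average over the number of ready tasks
 * (analogous to the classic 1/5/15 minute load average):
 *
 *	load = (load * EXP + nr_tasks * FIXED_1 * (FIXED_1 - EXP)) >> FSHIFT
 */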
void update_load(void)
{
	uint32_t core_id = CORE_ID;

	runqueues[core_id].balance_counter--;
	runqueues[core_id].load_counter--;

	if (runqueues[core_id].load_counter <= 0) {
		runqueues[core_id].load_counter += TIMER_FREQ/5;

		spinlock_irqsave_lock(&runqueues[core_id].lock);
		runqueues[core_id].load[0] *= EXP_1;
		runqueues[core_id].load[0] += (runqueues[core_id].nr_tasks *FIXED_1) * (FIXED_1 - EXP_1);
		runqueues[core_id].load[0] >>= FSHIFT;
		runqueues[core_id].load[1] *= EXP_5;
		runqueues[core_id].load[1] += (runqueues[core_id].nr_tasks *FIXED_1) * (FIXED_1 - EXP_5);
		runqueues[core_id].load[1] >>= FSHIFT;
		runqueues[core_id].load[2] *= EXP_15;
		runqueues[core_id].load[2] += (runqueues[core_id].nr_tasks *FIXED_1) * (FIXED_1 - EXP_15);
		runqueues[core_id].load[2] >>= FSHIFT;
		spinlock_irqsave_unlock(&runqueues[core_id].lock);

		//kprintf("load of core %u: %u, %u, %u, %u\n", core_id, runqueues[core_id].load[0], runqueues[core_id].load[1], runqueues[core_id].load[2], runqueues[core_id].nr_tasks);
	}
}

#if MAX_CORES > 1
extern atomic_int32_t cpu_online;
#endif

void dump_load(void)
{
	uint32_t i;
#if MAX_CORES > 1
	uint32_t ncores = atomic_int32_read(&cpu_online);
#else
	uint32_t ncores = 1;
#endif

	for(i=0; i<ncores; i++)
	{
		kprintf("Load average of core %u: %u.%u, %u.%u, %u.%u\n",
				i, runqueues[i].load[0] >> FSHIFT,
				((runqueues[i].load[0] & ((1 << FSHIFT) - 1)) * 100) / (1 << FSHIFT),
				runqueues[i].load[1] >> FSHIFT,
				((runqueues[i].load[1] & ((1 << FSHIFT) - 1)) * 100) / (1 << FSHIFT),
				runqueues[i].load[2] >> FSHIFT,
				((runqueues[i].load[2] & ((1 << FSHIFT) - 1)) * 100) / (1 << FSHIFT));
	}
}

#if MAX_CORES > 1
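/** @brief Simple work stealing between the per-core runqueues
 *
 * When balance_counter has expired and another core's load average is higher
 * than our own, the last task of its lowest-priority non-empty ready queue is
 * moved to this core's runqueue. Afterwards balance_counter is reset to
 * TIMER_FREQ/2.
 */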
void load_balancing(void)
{
#if 1
	uint32_t i, core_id = CORE_ID;
	uint32_t prio;
	task_t* task;

	spinlock_irqsave_lock(&runqueues[core_id].lock);
	for(i=0; (i<atomic_int32_read(&cpu_online)) && (runqueues[core_id].balance_counter <= 0); i++)
	{
		if (i == core_id)
			continue;

		spinlock_irqsave_lock(&runqueues[i].lock);
		if ((runqueues[i].load[0] >> (FSHIFT-1)) > (runqueues[core_id].load[0] >> (FSHIFT-1))) {
			//kprintf("Try to steal a task from core %u (load %u) to %u (load %u)\n", i, runqueues[i].load[0], core_id, runqueues[core_id].load[0]);
			//kprintf("Task on core %u: %u, core %u, %u\n", i, runqueues[i].nr_tasks, core_id, runqueues[i].nr_tasks);

			prio = lsb(runqueues[i].prio_bitmap);
			if (prio < sizeof(size_t)*8) {
				// steal a ready task
				task = runqueues[i].queue[prio-1].last;
				kprintf("Core %u steals the task %d form %u with prio %u\n", core_id, task->id, i, prio);

				// remove last element from queue i
				if (task->prev)
					task->prev->next = NULL;
				if (runqueues[i].queue[prio-1].first == task) {
					runqueues[i].queue[prio-1].first = runqueues[i].queue[prio-1].last = NULL;
					runqueues[i].prio_bitmap &= ~(1 << prio);
				} else runqueues[i].queue[prio-1].last = task->prev;

				// add task at the end of queue core_id
				if (!runqueues[core_id].queue[prio-1].last) {
					runqueues[core_id].queue[prio-1].first = runqueues[core_id].queue[prio-1].last = task;
					task->next = task->prev = NULL;
				} else {
					runqueues[core_id].queue[prio-1].last->next = task;
					task->prev = runqueues[core_id].queue[prio-1].last;
					runqueues[core_id].queue[prio-1].last = task;
					task->next = NULL;
				}
				runqueues[core_id].prio_bitmap |= (1 << prio);

				// update task counters
				runqueues[core_id].nr_tasks++;
				runqueues[i].nr_tasks--;
				runqueues[core_id].balance_counter = TIMER_FREQ/2;
			} /*else {
				task_t* tmp;

				// steal a blocked task
				task = runqueues[i].timers.first;
				if (!task) // Ups, found no valid task to steal
					goto no_task_found;

				kprintf("Core %u steals the blocked task %d from %u with prio %u\n", core_id, task->id, i, task->prio);

				// remove first timer from queue i
				if (runqueues[i].timers.first == runqueues[i].timers.last)
					runqueues[i].timers.first = runqueues[i].timers.last = NULL;
				else
					runqueues[i].timers.first = runqueues[i].timers.first->next;

				// add timer to queue core_id
				tmp = runqueues[core_id].timers.first;
				while(tmp && (task->timeout >= tmp->timeout))
					tmp = tmp->next;

				if (!tmp) {
					task->next = NULL;
					task->prev = runqueues[core_id].timers.last;
					if (runqueues[core_id].timers.last)
						runqueues[core_id].timers.last->next = task;
					runqueues[core_id].timers.last = task;
					if (!runqueues[core_id].timers.first)
						runqueues[core_id].timers.first = task;
				} else {
					task->prev = tmp->prev;
					task->next = tmp;
					tmp->prev = task;
					if (task->prev)
						task->prev->next = task;
					if (runqueues[core_id].timers.first == tmp)
						runqueues[core_id].timers.first = task;
				}

				// => reschedule on the new core
				task->last_core = CORE_ID;

				// update task counters
				runqueues[core_id].balance_counter = TIMER_FREQ/2;
			}*/
		}
//no_task_found:
		spinlock_irqsave_unlock(&runqueues[i].lock);
	}

	if (runqueues[core_id].balance_counter <= 0)
		runqueues[core_id].balance_counter = TIMER_FREQ/2;

	spinlock_irqsave_unlock(&runqueues[core_id].lock);
#endif
}
#endif

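/** @brief Determine the next task to run on this core and switch to it
 *
 * Expired timers are handled first: their tasks are woken up and moved back
 * into the runqueue. Then the highest priority with a ready task is selected;
 * if no task is ready and the current task cannot continue, the core's idle
 * task is chosen. Called with interrupts disabled, e.g. via reschedule().
 */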
void scheduler(void)
{
	task_t* orig_task;
	task_t* curr_task;
	uint32_t core_id = CORE_ID;
	uint32_t prio;
	uint64_t current_tick;

	orig_task = curr_task = per_core(current_task);
	curr_task->last_core = core_id;

	/* signals that this task slot can be reused */
	if (curr_task->status == TASK_FINISHED)
		curr_task->status = TASK_INVALID;

	spinlock_irqsave_lock(&runqueues[core_id].lock);

	// check timers
	current_tick = get_clock_tick();
	while (runqueues[core_id].timers.first && runqueues[core_id].timers.first->timeout <= current_tick)
	{
		task_t* task = runqueues[core_id].timers.first;

		// remove timer from queue
		runqueues[core_id].timers.first = runqueues[core_id].timers.first->next;
		if (runqueues[core_id].timers.first)
			runqueues[core_id].timers.first->prev = NULL;
		else
			runqueues[core_id].timers.last = NULL;
		task->flags &= ~TASK_TIMER;

		// wakeup task
		if (task->status == TASK_BLOCKED) {
			task->status = TASK_READY;
			prio = task->prio;

			// increase the number of ready tasks
			runqueues[core_id].nr_tasks++;

			// add task to the runqueue
			if (!runqueues[core_id].queue[prio-1].first) {
				runqueues[core_id].queue[prio-1].last = runqueues[core_id].queue[prio-1].first = task;
				task->next = task->prev = NULL;
				runqueues[core_id].prio_bitmap |= (1 << prio);
			} else {
				task->prev = runqueues[core_id].queue[prio-1].last;
				task->next = NULL;
				runqueues[core_id].queue[prio-1].last->next = task;
				runqueues[core_id].queue[prio-1].last = task;
			}
		}
	}

	runqueues[core_id].old_task = NULL; // reset old task
	prio = msb(runqueues[core_id].prio_bitmap); // determines highest priority
#if MAX_CORES > 1
	if (prio >= sizeof(size_t)*8) {
		// push load balancing
		runqueues[core_id].balance_counter -= TIMER_FREQ/20;
		load_balancing();
		prio = msb(runqueues[core_id].prio_bitmap); // retry...
	}
#endif

	if (prio >= sizeof(size_t)*8) {
		if ((curr_task->status == TASK_RUNNING) || (curr_task->status == TASK_IDLE))
			goto get_task_out;
		curr_task = per_core(current_task) = runqueues[core_id].idle;
	} else {
		// Does the current task have a higher priority? => no task switch
		if ((curr_task->prio > prio) && (curr_task->status == TASK_RUNNING))
			goto get_task_out;

		if (curr_task->status == TASK_RUNNING) {
			curr_task->status = TASK_READY;
			runqueues[core_id].old_task = curr_task;
		}

		curr_task = per_core(current_task) = runqueues[core_id].queue[prio-1].first;
		if (BUILTIN_EXPECT(curr_task->status == TASK_INVALID, 0)) {
			pushbg(COL_RED);
			kprintf("Upps!!!!!!! Got invalid task %d, orig task %d\n", curr_task->id, orig_task->id);
			popbg();
		}
		curr_task->status = TASK_RUNNING;

		// remove new task from queue
		runqueues[core_id].queue[prio-1].first = curr_task->next;
		if (!curr_task->next) {
			runqueues[core_id].queue[prio-1].last = NULL;
			runqueues[core_id].prio_bitmap &= ~(1 << prio);
		}
		curr_task->next = curr_task->prev = NULL;
	}

get_task_out:
	spinlock_irqsave_unlock(&runqueues[core_id].lock);

	if (curr_task != orig_task) {
		/* if the original task is using the FPU, we need to save the FPU context */
		if ((orig_task->flags & TASK_FPU_USED) && (orig_task->status == TASK_READY)) {
			save_fpu_state(&(orig_task->fpu));
			orig_task->flags &= ~TASK_FPU_USED;
		}

		//kprintf("schedule from %u to %u with prio %u on core %u\n",
		//	orig_task->id, curr_task->id, (uint32_t)curr_task->prio, CORE_ID);
		switch_task(curr_task->id);
		finish_task_switch(0);		
	}
}

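/** @brief Enter the scheduler with interrupts disabled and restore the
 * previous interrupt state afterwards */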
void reschedule(void)
{
	uint32_t flags = irq_nested_disable();
	scheduler();
	irq_nested_enable(flags);
}