diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ad9e9f36..08f925c3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -268,7 +268,7 @@ static inline void tlb_flush(void)
 static inline uint32_t read_eflags(void)
 {
 	uint32_t result;
-	asm volatile ("pushf; pop $0" : "=r"(result));
+	asm volatile ("pushf; pop %0" : "=r"(result));	/* push the flags register, then pop it into output operand %0 (result) */
 	return result;
 }
 
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index da4c65f4..ca860d4f 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -320,6 +320,8 @@ void smp_start(uint32_t id)
 	// enable additional cpu features
 	cpu_detection();
 
+	kprintf("CR0 of core %u: 0x%x\n", apic_cpu_id(), read_cr0());
+
 	smp_main();
 
 	// idle loop
@@ -355,6 +357,8 @@ int smp_init(void)
 	if (ncores <= 1)
 		return -EINVAL;
 
+	kprintf("CR0 of core %u: 0x%x\n", apic_cpu_id(), read_cr0());
+
 	for(i=1; (i<ncores) && (i<MAX_CORES); i++)
 	{
 		/*
diff --git a/arch/x86/kernel/timer.c b/arch/x86/kernel/timer.c
index 70f62b1c..0ea49d0b 100644
--- a/arch/x86/kernel/timer.c
+++ b/arch/x86/kernel/timer.c
@@ -79,12 +79,16 @@ static void timer_handler(struct state *s)
 		/*if (timer_ticks % TIMER_FREQ == 0) {
 			vga_puts("One second has passed\n");
 		}*/
+
+		/* Dump load every minute */
+		//if (timer_ticks % (TIMER_FREQ*60) == 0)
+		//	dump_load();
 	}
 
 	update_load();
 
 #if MAX_CORES > 1
-	if ((atomic_int32_read(&cpu_online) > 1) && (timer_ticks % (TIMER_FREQ/5) == 0))
+	if (atomic_int32_read(&cpu_online) > 1)
 		load_balancing();
 #endif
 }
diff --git a/include/metalsvm/tasks.h b/include/metalsvm/tasks.h
index 3f4589cb..55056cbd 100644
--- a/include/metalsvm/tasks.h
+++ b/include/metalsvm/tasks.h
@@ -92,6 +92,11 @@ tid_t wait(int32_t* result);
  */
 void update_load(void);
 
+/** @brief Print the load averages of every online core
+ *
+ */
+void dump_load(void);
+
 #if MAX_CORES > 1
 /** @brief Load balancer
  *
@@ -103,7 +108,8 @@ void load_balancing(void);
 
 /** @brief Task switcher
  *
- * Timer-interrupted use of this function for task switching */
+ * Timer-interrupted use of this function for task switching
+ */
 void scheduler(void);
 
 /** @brief Wake up a blocked task
diff --git a/include/metalsvm/tasks_types.h b/include/metalsvm/tasks_types.h
index 6813896e..3e0253cd 100644
--- a/include/metalsvm/tasks_types.h
+++ b/include/metalsvm/tasks_types.h
@@ -123,7 +123,7 @@ typedef struct {
 	/// total number of tasks in the queue
 	uint32_t	nr_tasks;
 	// current load = average number of tasks in the queue (1-minute average)
-	uint32_t	load;
+	uint32_t	load[3];
 	// help counter to determine the the cpu load
 	int32_t 	load_counter;
 	// help counter to avoid "over balancing"
diff --git a/kernel/tasks.c b/kernel/tasks.c
index 039d7ad1..db30571f 100644
--- a/kernel/tasks.c
+++ b/kernel/tasks.c
@@ -53,11 +53,11 @@ static task_t task_table[MAX_TASKS] = { \
 static spinlock_irqsave_t table_lock = SPINLOCK_IRQSAVE_INIT;
 #if MAX_CORES > 1
 static runqueue_t runqueues[MAX_CORES] = { \
-		[0]                 = {task_table+0, NULL, 0, 0, 0, 0, 0, {[0 ... MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_IRQSAVE_INIT}, \
-		[1 ... MAX_CORES-1] = {NULL,         NULL, 0, 0, 0, 0, 0, {[0 ... MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_IRQSAVE_INIT}};
+		[0]                 = {task_table+0, NULL, 0, {[0 ... 2] = 0}, TIMER_FREQ/5, TIMER_FREQ/2, 0, {[0 ... MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_IRQSAVE_INIT}, \
+		[1 ... MAX_CORES-1] = {NULL,         NULL, 0, {[0 ... 2] = 0}, TIMER_FREQ/5, TIMER_FREQ/2, 0, {[0 ... MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_IRQSAVE_INIT}}; /* NOTE(review): positional initializers; TIMER_FREQ/5 and TIMER_FREQ/2 presumably seed load_counter and balance_counter - confirm against runqueue_t */
 #else
 static runqueue_t runqueues[1] = { \
-		[0]                 = {task_table+0, NULL, 0, 0, 0, 0, 0, {[0 ... MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_IRQSAVE_INIT}};
+		[0]                 = {task_table+0, NULL, 0, {[0 ... 2] = 0}, TIMER_FREQ/5, TIMER_FREQ/2, 0, {[0 ... MAX_PRIO-1] = {NULL, NULL}}, {NULL, NULL}, SPINLOCK_IRQSAVE_INIT}}; /* NOTE(review): same positional layout as the multi-core table - confirm against runqueue_t */
 #endif
 
 DEFINE_PER_CORE(task_t*, current_task, task_table+0);
@@ -1079,63 +1079,96 @@ int set_timer(uint64_t deadline)
 	return ret;
 }
 
-#define FSHIFT	21		/* nr of bits of precision (e.g. 11) */
-#define FIXED_1	(1<<FSHIFT)	/* 1.0 as fixed-point */
-#define EXP	1884		/* 1/exp(5sec/1min) as fixed-point */
+/* Fixed-point constants for the load averages. NOTE(review): the EXP_* factors assume one sample every 5 s, but update_load() re-arms load_counter with TIMER_FREQ/5 - confirm the intended sample period */
+#define FSHIFT	11		/* nr of fractional bits of the fixed-point format */
+#define FIXED_1	(1<<FSHIFT)	/* 1.0 as fixed-point */
+#define EXP_1	1884		/* FIXED_1/exp(5sec/1min),  decay factor applied to load[0] */
+#define EXP_5	2014		/* FIXED_1/exp(5sec/5min),  decay factor applied to load[1] */
+#define EXP_15	2037		/* FIXED_1/exp(5sec/15min), decay factor applied to load[2] */
 
 void update_load(void)
 {
 	uint32_t core_id = CORE_ID;
 
+	runqueues[core_id].balance_counter--;	/* unconditional countdown; load_balancing() only scans other cores while this is <= 0 */
 	runqueues[core_id].load_counter--;
-	if (runqueues[core_id].balance_counter > 0)
-		runqueues[core_id].balance_counter--;
-	if (runqueues[core_id].load_counter < 0) {
-		runqueues[core_id].load_counter += 5*TIMER_FREQ;
+
+	if (runqueues[core_id].load_counter <= 0) {	/* countdown expired: refresh the averages of this core */
+		runqueues[core_id].load_counter += TIMER_FREQ/5;	/* re-arm the countdown for the next refresh */
 
 		spinlock_irqsave_lock(&runqueues[core_id].lock);
-		runqueues[core_id].load *= EXP;
-		runqueues[core_id].load += runqueues[core_id].nr_tasks*(FIXED_1-EXP);
-		runqueues[core_id].load >>= FSHIFT;
+		runqueues[core_id].load[0] *= EXP_1;	/* load = (load*EXP + nr_tasks*FIXED_1*(FIXED_1-EXP)) >> FSHIFT */
+		runqueues[core_id].load[0] += (runqueues[core_id].nr_tasks *FIXED_1) * (FIXED_1 - EXP_1);
+		runqueues[core_id].load[0] >>= FSHIFT;
+		runqueues[core_id].load[1] *= EXP_5;	/* same update with the EXP_5 decay factor */
+		runqueues[core_id].load[1] += (runqueues[core_id].nr_tasks *FIXED_1) * (FIXED_1 - EXP_5);
+		runqueues[core_id].load[1] >>= FSHIFT;
+		runqueues[core_id].load[2] *= EXP_15;	/* same update with the EXP_15 decay factor */
+		runqueues[core_id].load[2] += (runqueues[core_id].nr_tasks *FIXED_1) * (FIXED_1 - EXP_15);
+		runqueues[core_id].load[2] >>= FSHIFT;
 		spinlock_irqsave_unlock(&runqueues[core_id].lock);
 
-		//kprintf("load of core %u: %u, %u\n", core_id, runqueues[core_id].load, runqueues[core_id].nr_tasks);
+		//kprintf("load of core %u: %u, %u, %u, %u\n", core_id, runqueues[core_id].load[0], runqueues[core_id].load[1], runqueues[core_id].load[2], runqueues[core_id].nr_tasks);
 	}
 }
 
 #if MAX_CORES > 1
 extern atomic_int32_t cpu_online;
+#endif
 
+void dump_load(void)
+{
+	uint32_t i;
+#if MAX_CORES > 1
+	uint32_t ncores = atomic_int32_read(&cpu_online);
+#else
+	uint32_t ncores = 1;
+#endif
+	/* Print the three load averages of every online core as <integer>.<two-digit fraction>; load[] is fixed-point with FSHIFT fractional bits */
+	for(i=0; i<ncores; i++)
+	{
+		kprintf("Load average of core %u: %u.%02u, %u.%02u, %u.%02u\n",	/* %02u: zero-pad the fraction so 1.05 is not shown as 1.5 */
+				i, runqueues[i].load[0] >> FSHIFT,	/* integer part */
+				((runqueues[i].load[0] & (FIXED_1 - 1)) * 100) / FIXED_1,	/* fractional bits scaled to 0..99 */
+				runqueues[i].load[1] >> FSHIFT,
+				((runqueues[i].load[1] & (FIXED_1 - 1)) * 100) / FIXED_1,
+				runqueues[i].load[2] >> FSHIFT,
+				((runqueues[i].load[2] & (FIXED_1 - 1)) * 100) / FIXED_1);
+	}
+}
+
+#if MAX_CORES > 1
 void load_balancing(void)
 {
-#if 0
+#if 1
 	uint32_t i, core_id = CORE_ID;
 	uint32_t prio;
 	task_t* task;
 
-	spinlock_lock(&runqueues[core_id].lock);
+	spinlock_irqsave_lock(&runqueues[core_id].lock);
 	for(i=0; (i<atomic_int32_read(&cpu_online)) && (runqueues[core_id].balance_counter <= 0); i++)
 	{
 		if (i == core_id)
-			break;
+			continue;
 
-		spinlock_lock(&runqueues[i].lock);
-		if (runqueues[i].load > runqueues[core_id].load) {
-			kprintf("Try to steal a task from core %u (load %u) to %u (load %u)\n", i, runqueues[i].load, core_id, runqueues[core_id].load);
-			kprintf("Task on core %u: %u, core %u, %u\n", i, runqueues[i].nr_tasks, core_id, runqueues[i].nr_tasks);
+		spinlock_irqsave_lock(&runqueues[i].lock);
+		if ((runqueues[i].load[0] >> (FSHIFT-1)) > (runqueues[core_id].load[0] >> (FSHIFT-1))) {
+			//kprintf("Try to steal a task from core %u (load %u) to %u (load %u)\n", i, runqueues[i].load[0], core_id, runqueues[core_id].load[0]);
+			//kprintf("Task on core %u: %u, core %u, %u\n", i, runqueues[i].nr_tasks, core_id, runqueues[i].nr_tasks);
 
-			prio = last_set(runqueues[i].prio_bitmap);
-			if (prio) {
+			prio = lsb(runqueues[i].prio_bitmap);
+			if (prio < sizeof(size_t)*8) {
 				// steal a ready task
 				task = runqueues[i].queue[prio-1].last;
-				kprintf("Try to steal a ready task %d\n", task->id);
+				kprintf("Core %u steals the task %d form %u with prio %u\n", core_id, task->id, i, prio);
 
 				// remove last element from queue i
 				if (task->prev)
 					task->prev->next = NULL;
-				runqueues[i].queue[prio-1].last = task->prev;
-				if (!runqueues[i].queue[prio-1].last)
-					runqueues[i].queue[prio-1].first = NULL;
+				if (runqueues[i].queue[prio-1].first == task) {
+					runqueues[i].queue[prio-1].first = runqueues[i].queue[prio-1].last = NULL;
+					runqueues[i].prio_bitmap &= ~(1 << prio);
+				} else runqueues[i].queue[prio-1].last = task->prev;
 
 				// add task at the end of queue core_id
 				if (!runqueues[core_id].queue[prio-1].last) {
@@ -1147,12 +1180,13 @@ void load_balancing(void)
 					runqueues[core_id].queue[prio-1].last = task;
 					task->next = NULL;
 				}
+				runqueues[core_id].prio_bitmap |= (1 << prio);
 
 				// update task counters
 				runqueues[core_id].nr_tasks++;
 				runqueues[i].nr_tasks--;
-				runqueues[core_id].balance_counter = 5*TIMER_FREQ;
-			} else {
+				runqueues[core_id].balance_counter = TIMER_FREQ/2;
+			} /*else {
 				task_t* tmp;
 
 				// steal a blocked task
@@ -1160,7 +1194,7 @@ void load_balancing(void)
 				if (!task) // Ups, found no valid task to steal
 					goto no_task_found;
 
-				kprintf("Try to steal blocked task %d\n", task->id);
+				kprintf("Core %u steals the blocked task %d from %u with prio %u\n", core_id, task->id, i, task->prio);
 
 				// remove first timer from queue i
 				if (runqueues[i].timers.first == runqueues[i].timers.last)
@@ -1195,15 +1229,17 @@ void load_balancing(void)
 				task->last_core = CORE_ID;
 
 				// update task counters
-				runqueues[core_id].nr_tasks++;
-				runqueues[i].nr_tasks--;
-				runqueues[core_id].balance_counter = 5*TIMER_FREQ;
-			}
+				runqueues[core_id].balance_counter = TIMER_FREQ/2;
+			}*/
 		}
-no_task_found:
-		spinlock_unlock(&runqueues[i].lock);
+//no_task_found:
+		spinlock_irqsave_unlock(&runqueues[i].lock);
 	}
-	spinlock_unlock(&runqueues[core_id].lock);
+
+	if (runqueues[core_id].balance_counter <= 0)
+		runqueues[core_id].balance_counter = TIMER_FREQ/2;
+
+	spinlock_irqsave_unlock(&runqueues[core_id].lock);
 #endif
 }
 #endif
@@ -1271,6 +1307,8 @@ void scheduler(void)
 	prio = msb(runqueues[core_id].prio_bitmap); // determines highest priority
 #if MAX_CORES > 1
 	if (prio >= sizeof(size_t)*8) {
+		// push load balancing
+		runqueues[core_id].balance_counter -= TIMER_FREQ/20;
 		load_balancing();
 		prio = msb(runqueues[core_id].prio_bitmap); // retry...
 	}
diff --git a/kernel/tests.c b/kernel/tests.c
index e8390fa6..c01d806b 100644
--- a/kernel/tests.c
+++ b/kernel/tests.c
@@ -87,7 +87,7 @@ static int foo(void* arg)
 		return 0;
 
 	for(i=0; i<5; i++) {
-		kprintf("Message from core %d: %s\n", smp_id(), (char*) arg);
+		kprintf("%s\n", (char*) arg);
 		sleep(1);
 	}
 
@@ -262,7 +262,7 @@ static int join_test(void* arg)
 	tid_t 	id, ret;
 	int 	result = -1234;
 
-	create_kernel_task(&id, foo, "Hello from foo2", HIGH_PRIO);
+	create_kernel_task(&id, foo, "Hello from foo2", HIGH_PRIO-1);
 
 	kprintf("Wait for child %u\n", id);
 	do {
@@ -278,7 +278,7 @@ int test_init(void)
 {
 //	char* argv[] = {"/bin/mshell", NULL};
 	char* argv[] = {"/bin/tests", NULL};
-//	char* server_argv[] = {"/bin/server", "6789", NULL};
+	char* server_argv[] = {"/bin/server", "6789", NULL};
 //	char* client_argv[] = {"/bin/client", "127.0.0.1", "6789", NULL};
 
 	sem_init(&producing, 1);