diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c
index 6eb44ca5dba6252e6971cce1708211916505b042..bb0bb34555da4bbbc7a8eb258dc9a198d0cde107 100644
--- a/arch/sparc64/kernel/irq.c
+++ b/arch/sparc64/kernel/irq.c
@@ -1018,21 +1018,29 @@ static void __cpuinit init_cpu_send_mondo_info(struct trap_per_cpu *tb, int use_
 }
 
 /* Allocate and register the mondo and error queues for this cpu.  */
-void __cpuinit sun4v_init_mondo_queues(int use_bootmem)
+void __cpuinit sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load)
 {
-	int cpu = hard_smp_processor_id();
 	struct trap_per_cpu *tb = &trap_block[cpu];
 
-	alloc_one_mondo(&tb->cpu_mondo_pa, use_bootmem);
-	alloc_one_mondo(&tb->dev_mondo_pa, use_bootmem);
-	alloc_one_mondo(&tb->resum_mondo_pa, use_bootmem);
-	alloc_one_kbuf(&tb->resum_kernel_buf_pa, use_bootmem);
-	alloc_one_mondo(&tb->nonresum_mondo_pa, use_bootmem);
-	alloc_one_kbuf(&tb->nonresum_kernel_buf_pa, use_bootmem);
+	if (alloc) {
+		alloc_one_mondo(&tb->cpu_mondo_pa, use_bootmem);
+		alloc_one_mondo(&tb->dev_mondo_pa, use_bootmem);
+		alloc_one_mondo(&tb->resum_mondo_pa, use_bootmem);
+		alloc_one_kbuf(&tb->resum_kernel_buf_pa, use_bootmem);
+		alloc_one_mondo(&tb->nonresum_mondo_pa, use_bootmem);
+		alloc_one_kbuf(&tb->nonresum_kernel_buf_pa, use_bootmem);
 
-	init_cpu_send_mondo_info(tb, use_bootmem);
+		init_cpu_send_mondo_info(tb, use_bootmem);
+	}
 
-	sun4v_register_mondo_queues(cpu);
+	if (load) {
+		if (cpu != hard_smp_processor_id()) {
+			prom_printf("SUN4V: init mondo on cpu %d not %d\n",
+				    cpu, hard_smp_processor_id());
+			prom_halt();
+		}
+		sun4v_register_mondo_queues(cpu);
+	}
 }
 
 /* Only invoked on boot processor. */
@@ -1043,7 +1051,7 @@ void __init init_IRQ(void)
 	memset(&ivector_table[0], 0, sizeof(ivector_table));
 
 	if (tlb_type == hypervisor)
-		sun4v_init_mondo_queues(1);
+		sun4v_init_mondo_queues(1, hard_smp_processor_id(), 1, 1);
 
 	/* We need to clear any IRQ's pending in the soft interrupt
 	 * registers, a spurious one could be left around from the
diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c
index 06807cf95ee15a989ec56d43af1b5d88b295ecdf..9b0c409d5b6a786f69bc7a0943fb518f9c1a828b 100644
--- a/arch/sparc64/kernel/setup.c
+++ b/arch/sparc64/kernel/setup.c
@@ -384,7 +384,7 @@ void __init setup_arch(char **cmdline_p)
 	paging_init();
 
 	/* Get boot processor trap_block[] setup.  */
-	init_cur_cpu_trap();
+	init_cur_cpu_trap(current_thread_info());
 }
 
 static int __init set_preferred_console(void)
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 527dfd7ae210708297d47c2f1fb3cb2b6835c9bc..b586345fe3b9651ebd6aa6fb89a880370ef54a78 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -316,6 +316,8 @@ static void smp_synchronize_one_tick(int cpu)
 	spin_unlock_irqrestore(&itc_sync_lock, flags);
 }
 
+extern void sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load);
+
 extern unsigned long sparc64_cpu_startup;
 
 /* The OBP cpu startup callback truncates the 3rd arg cookie to
@@ -339,6 +341,9 @@ static int __devinit smp_boot_one_cpu(unsigned int cpu)
 	cpu_set(cpu, cpu_callout_map);
 
 	if (tlb_type == hypervisor) {
+		/* Allocate the mondo queues; the new cpu will load them.  */
+		sun4v_init_mondo_queues(0, cpu, 1, 0);
+
 		prom_startcpu_cpuid(cpu, entry, cookie);
 	} else {
 		int cpu_node;
@@ -352,6 +357,7 @@ static int __devinit smp_boot_one_cpu(unsigned int cpu)
 			break;
 		udelay(100);
 	}
+
 	if (callin_flag) {
 		ret = 0;
 	} else {
diff --git a/arch/sparc64/kernel/trampoline.S b/arch/sparc64/kernel/trampoline.S
index b9c9f54b0a00bd5c51e853bcf1e5a9950c30f914..a4dc01a3d23842afa0ae05ecfaa52ab15e4f9cae 100644
--- a/arch/sparc64/kernel/trampoline.S
+++ b/arch/sparc64/kernel/trampoline.S
@@ -30,12 +30,16 @@ itlb_load:
 dtlb_load:
 	.asciz	"SUNW,dtlb-load"
 
+	/* XXX __cpuinit this thing XXX */
+#define TRAMP_STACK_SIZE	1024
+	.align	16
+tramp_stack:
+	.skip	TRAMP_STACK_SIZE
+
 	.text
 	.align		8
 	.globl		sparc64_cpu_startup, sparc64_cpu_startup_end
 sparc64_cpu_startup:
-	flushw
-
 	BRANCH_IF_SUN4V(g1, niagara_startup)
 	BRANCH_IF_CHEETAH_BASE(g1, g5, cheetah_startup)
 	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1, g5, cheetah_plus_startup)
@@ -58,6 +62,7 @@ cheetah_startup:
 	or	%g5, DCU_DM | DCU_IM | DCU_DC | DCU_IC, %g5
 	stxa	%g5, [%g0] ASI_DCU_CONTROL_REG
 	membar	#Sync
+	/* fallthru */
 
 cheetah_generic_startup:
 	mov	TSB_EXTENSION_P, %g3
@@ -90,19 +95,17 @@ spitfire_startup:
 	membar		#Sync
 
 startup_continue:
-	wrpr		%g0, 15, %pil
-
 	sethi		%hi(0x80000000), %g2
 	sllx		%g2, 32, %g2
 	wr		%g2, 0, %tick_cmpr
 
+	mov		%o0, %l0
+
 	BRANCH_IF_SUN4V(g1, niagara_lock_tlb)
 
 	/* Call OBP by hand to lock KERNBASE into i/d tlbs.
 	 * We lock 2 consequetive entries if we are 'bigkernel'.
 	 */
-	mov		%o0, %l0
-
 	sethi		%hi(prom_entry_lock), %g2
 1:	ldstub		[%g2 + %lo(prom_entry_lock)], %g1
 	membar		#StoreLoad | #StoreStore
@@ -112,7 +115,6 @@ startup_continue:
 	sethi		%hi(p1275buf), %g2
 	or		%g2, %lo(p1275buf), %g2
 	ldx		[%g2 + 0x10], %l2
-	mov		%sp, %l1
 	add		%l2, -(192 + 128), %sp
 	flushw
 
@@ -308,18 +310,9 @@ niagara_lock_tlb:
 	ta		HV_FAST_TRAP
 
 after_lock_tlb:
-	mov		%l1, %sp
-	flushw
-
-	mov		%l0, %o0
-
 	wrpr		%g0, (PSTATE_PRIV | PSTATE_PEF), %pstate
 	wr		%g0, 0, %fprs
 
-	/* XXX Buggy PROM... */
-	srl		%o0, 0, %o0
-	ldx		[%o0], %g6
-
 	wr		%g0, ASI_P, %asi
 
 	mov		PRIMARY_CONTEXT, %g7
@@ -341,22 +334,25 @@ after_lock_tlb:
 
 	membar		#Sync
 
-	mov		1, %g5
-	sllx		%g5, THREAD_SHIFT, %g5
-	sub		%g5, (STACKFRAME_SZ + STACK_BIAS), %g5
-	add		%g6, %g5, %sp
+	/* Until we properly take over the trap table from the
+	 * firmware and hypervisor, everything we do here must be
+	 * done with extreme care.  In particular, we cannot make
+	 * any references to %g6 (current thread pointer), %g4
+	 * (current task pointer), or %g5 (base of current cpu's
+	 * per-cpu area) before that point.
+	 *
+	 * Get onto temporary stack which is in the locked kernel image.
+	 */
+	sethi		%hi(tramp_stack), %g1
+	or		%g1, %lo(tramp_stack), %g1
+	add		%g1, TRAMP_STACK_SIZE, %g1
+	sub		%g1, STACKFRAME_SZ + STACK_BIAS, %sp
 	mov		0, %fp
 
-	wrpr		%g0, 0, %wstate
-	wrpr		%g0, 0, %tl
-
-	/* Load TBA, then we can resurface. */
-	sethi		%hi(sparc64_ttable_tl0), %g5
-	wrpr		%g5, %tba
-
-	ldx		[%g6 + TI_TASK], %g4
-
-	wrpr		%g0, 0, %wstate
+	/* Put garbage in these registers to trap any access to them.  */
+	set		0xdeadbeef, %g4
+	set		0xdeadbeef, %g5
+	set		0xdeadbeef, %g6
 
 	call		init_irqwork_curcpu
 	 nop
@@ -367,11 +363,17 @@ after_lock_tlb:
 	bne,pt		%icc, 1f
 	 nop
 
+	call		hard_smp_processor_id
+	 nop
+	
+	mov		%o0, %o1
+	mov		0, %o0
+	mov		0, %o2
 	call		sun4v_init_mondo_queues
-	 mov		0, %o0
+	 mov		1, %o3
 
 1:	call		init_cur_cpu_trap
-	 nop
+	 ldx		[%l0], %o0
 
 	/* Start using proper page size encodings in ctx register.  */
 	sethi		%hi(sparc64_kern_pri_context), %g3
@@ -386,9 +388,14 @@ after_lock_tlb:
 
 	membar		#Sync
 
-	rdpr		%pstate, %o1
-	or		%o1, PSTATE_IE, %o1
-	wrpr		%o1, 0, %pstate
+	wrpr		%g0, 0, %wstate
+
+	/* As a hack, put &init_thread_union into %g6.
+	 * prom_world() loads from here to restore the %asi
+	 * register.
+	 */
+	sethi		%hi(init_thread_union), %g6
+	or		%g6, %lo(init_thread_union), %g6
 
 	sethi		%hi(is_sun4v), %o0
 	lduw		[%o0 + %lo(is_sun4v)], %o0
@@ -418,7 +425,20 @@ after_lock_tlb:
 1:	call		prom_set_trap_table
 	 sethi		%hi(sparc64_ttable_tl0), %o0
 
-2:	call		smp_callin
+2:	ldx		[%l0], %g6
+	ldx		[%g6 + TI_TASK], %g4
+
+	mov		1, %g5
+	sllx		%g5, THREAD_SHIFT, %g5
+	sub		%g5, (STACKFRAME_SZ + STACK_BIAS), %g5
+	add		%g6, %g5, %sp
+	mov		0, %fp
+
+	rdpr		%pstate, %o1
+	or		%o1, PSTATE_IE, %o1
+	wrpr		%o1, 0, %pstate
+
+	call		smp_callin
 	 nop
 	call		cpu_idle
 	 mov		0, %o0
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index 5956d0a940095c40c9d3ad503e3e0c88f6baf9fa..c9484ae5bb8fc339a0fd3bef7bcd4e523d6d76af 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -2413,12 +2413,12 @@ struct trap_per_cpu trap_block[NR_CPUS];
 /* This can get invoked before sched_init() so play it super safe
  * and use hard_smp_processor_id().
  */
-void init_cur_cpu_trap(void)
+void init_cur_cpu_trap(struct thread_info *t)
 {
 	int cpu = hard_smp_processor_id();
 	struct trap_per_cpu *p = &trap_block[cpu];
 
-	p->thread = current_thread_info();
+	p->thread = t;
 	p->pgd_paddr = 0;
 }
 
diff --git a/include/asm-sparc64/cpudata.h b/include/asm-sparc64/cpudata.h
index 5a970f5ed9bd89ea2fc3d468c8246b0722aa2035..771aa94dfd95e2bcfaca00f986b2eb399b9e74bb 100644
--- a/include/asm-sparc64/cpudata.h
+++ b/include/asm-sparc64/cpudata.h
@@ -77,7 +77,7 @@ struct trap_per_cpu {
 	unsigned long		__pad2[4];
 } __attribute__((aligned(64)));
 extern struct trap_per_cpu trap_block[NR_CPUS];
-extern void init_cur_cpu_trap(void);
+extern void init_cur_cpu_trap(struct thread_info *);
 extern void setup_tba(void);
 
 #ifdef CONFIG_SMP