diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index bc4f562bd4594762a04dc7a845be8b0fcfa6b763..7594764d8a69c69040bfd18a2ccb0a8e17c84464 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -100,9 +100,14 @@ extern void cpu_idle_on_new_stack(struct thread_info *old_ti,
 
 #else /* __ASSEMBLY__ */
 
-/* how to get the thread information struct from ASM */
+/*
+ * How to get the thread information struct from assembly.
+ * Note that we use different macros since different architectures
+ * have different semantics in their "mm" instruction and we would
+ * like to guarantee that the macro expands to exactly one instruction.
+ */
 #ifdef __tilegx__
-#define GET_THREAD_INFO(reg) move reg, sp; mm reg, zero, LOG2_THREAD_SIZE, 63
+#define EXTRACT_THREAD_INFO(reg) mm reg, zero, LOG2_THREAD_SIZE, 63
 #else
 #define GET_THREAD_INFO(reg) mm reg, sp, zero, LOG2_THREAD_SIZE, 31
 #endif
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index 5d56a1ef5ba5212e6cffe97e0fe14223bc0d116b..6943515100f8518cbe3b6f37c4eae8440c6a4e6d 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -838,6 +838,18 @@ STD_ENTRY(interrupt_return)
 .Lresume_userspace:
 	FEEDBACK_REENTER(interrupt_return)
 
+	/*
+	 * Use r33 to hold whether we have already loaded the callee-saves
+	 * into ptregs.  We don't want to do it twice in this loop, since
+	 * then we'd clobber whatever changes are made by ptrace, etc.
+	 * Get base of stack in r32.
+	 */
+	{
+	 GET_THREAD_INFO(r32)
+	 movei  r33, 0
+	}
+
+.Lretry_work_pending:
 	/*
 	 * Disable interrupts so as to make sure we don't
 	 * miss an interrupt that sets any of the thread flags (like
@@ -848,9 +860,6 @@ STD_ENTRY(interrupt_return)
 	IRQ_DISABLE(r20, r21)
 	TRACE_IRQS_OFF  /* Note: clobbers registers r0-r29 */
 
-	/* Get base of stack in r32; note r30/31 are used as arguments here. */
-	GET_THREAD_INFO(r32)
-
 
 	/* Check to see if there is any work to do before returning to user. */
 	{
@@ -866,16 +875,18 @@ STD_ENTRY(interrupt_return)
 
 	/*
 	 * Make sure we have all the registers saved for signal
-	 * handling or single-step.  Call out to C code to figure out
-	 * exactly what we need to do for each flag bit, then if
-	 * necessary, reload the flags and recheck.
+	 * handling, notify-resume, or single-step.  Call out to C
+	 * code to figure out exactly what we need to do for each flag bit,
+	 * then if necessary, reload the flags and recheck.
 	 */
-	push_extra_callee_saves r0
 	{
 	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
-	 jal    do_work_pending
+	 bnz    r33, 1f
 	}
-	bnz     r0, .Lresume_userspace
+	push_extra_callee_saves r0
+	movei   r33, 1
+1:	jal     do_work_pending
+	bnz     r0, .Lretry_work_pending
 
 	/*
 	 * In the NMI case we
@@ -1180,10 +1191,12 @@ handle_syscall:
 	add     r20, r20, tp
 	lw      r21, r20
 	addi    r21, r21, 1
-	sw      r20, r21
+	{
+	 sw     r20, r21
+	 GET_THREAD_INFO(r31)
+	}
 
 	/* Trace syscalls, if requested. */
-	GET_THREAD_INFO(r31)
 	addi	r31, r31, THREAD_INFO_FLAGS_OFFSET
 	lw	r30, r31
 	andi    r30, r30, _TIF_SYSCALL_TRACE
@@ -1362,7 +1375,10 @@ handle_ill:
 3:
 	/* set PC and continue */
 	lw      r26, r24
-	sw      r28, r26
+	{
+	 sw     r28, r26
+	 GET_THREAD_INFO(r0)
+	}
 
 	/*
 	 * Clear TIF_SINGLESTEP to prevent recursion if we execute an ill.
@@ -1370,7 +1386,6 @@ handle_ill:
 	 * need to clear it here and can't really impose on all other arches.
 	 * So what's another write between friends?
 	 */
-	GET_THREAD_INFO(r0)
 
 	addi    r1, r0, THREAD_INFO_FLAGS_OFFSET
 	{
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 49d9d66216822a463a4e905f3425e8eb2a4f77e8..30ae76e50c44e458fc5b87e2d8712de3219383a5 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -646,6 +646,20 @@ STD_ENTRY(interrupt_return)
 .Lresume_userspace:
 	FEEDBACK_REENTER(interrupt_return)
 
+	/*
+	 * Use r33 to hold whether we have already loaded the callee-saves
+	 * into ptregs.  We don't want to do it twice in this loop, since
+	 * then we'd clobber whatever changes are made by ptrace, etc.
+	 */
+	{
+	 movei  r33, 0
+	 move   r32, sp
+	}
+
+	/* Get base of stack in r32. */
+	EXTRACT_THREAD_INFO(r32)
+
+.Lretry_work_pending:
 	/*
 	 * Disable interrupts so as to make sure we don't
 	 * miss an interrupt that sets any of the thread flags (like
@@ -656,9 +670,6 @@ STD_ENTRY(interrupt_return)
 	IRQ_DISABLE(r20, r21)
 	TRACE_IRQS_OFF  /* Note: clobbers registers r0-r29 */
 
-	/* Get base of stack in r32; note r30/31 are used as arguments here. */
-	GET_THREAD_INFO(r32)
-
 
 	/* Check to see if there is any work to do before returning to user. */
 	{
@@ -674,16 +685,18 @@ STD_ENTRY(interrupt_return)
 
 	/*
 	 * Make sure we have all the registers saved for signal
-	 * handling or single-step.  Call out to C code to figure out
+	 * handling or notify-resume.  Call out to C code to figure out
 	 * exactly what we need to do for each flag bit, then if
 	 * necessary, reload the flags and recheck.
 	 */
-	push_extra_callee_saves r0
 	{
 	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
-	 jal    do_work_pending
+	 bnez   r33, 1f
 	}
-	bnez    r0, .Lresume_userspace
+	push_extra_callee_saves r0
+	movei   r33, 1
+1:	jal     do_work_pending
+	bnez    r0, .Lretry_work_pending
 
 	/*
 	 * In the NMI case we
@@ -968,11 +981,16 @@ handle_syscall:
 	shl16insli r20, r20, hw0(irq_stat + IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET)
 	add     r20, r20, tp
 	ld4s    r21, r20
-	addi    r21, r21, 1
-	st4     r20, r21
+	{
+	 addi   r21, r21, 1
+	 move   r31, sp
+	}
+	{
+	 st4    r20, r21
+	 EXTRACT_THREAD_INFO(r31)
+	}
 
 	/* Trace syscalls, if requested. */
-	GET_THREAD_INFO(r31)
 	addi	r31, r31, THREAD_INFO_FLAGS_OFFSET
 	ld	r30, r31
 	andi    r30, r30, _TIF_SYSCALL_TRACE
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 2d5ef617bb3906cfcf1f9846af10b9d87e8d53e0..54e6c64b85cc5ee398be55e110b30d98114eb6cb 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -567,6 +567,10 @@ struct task_struct *__sched _switch_to(struct task_struct *prev,
  */
 int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 {
+	/* If we enter in kernel mode, do nothing and exit the caller loop. */
+	if (!user_mode(regs))
+		return 0;
+
 	if (thread_info_flags & _TIF_NEED_RESCHED) {
 		schedule();
 		return 1;
@@ -589,8 +593,7 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 		return 1;
 	}
 	if (thread_info_flags & _TIF_SINGLESTEP) {
-		if ((regs->ex1 & SPR_EX_CONTEXT_1_1__PL_MASK) == 0)
-			single_step_once(regs);
+		single_step_once(regs);
 		return 0;
 	}
 	panic("work_pending: bad flags %#x\n", thread_info_flags);