diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index ff2b2154ac1b6c8a0e4bddec0d27319575f47933..786d1a57048b9156396a7f7d5739306097420c60 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -207,6 +207,8 @@ static void __init init_amd(struct cpuinfo_x86 *c)
 		set_bit(X86_FEATURE_K7, c->x86_capability); 
 		break;
 	}
+	if (c->x86 >= 6)
+		set_bit(X86_FEATURE_FXSAVE_LEAK, c->x86_capability);
 
 	display_cacheinfo(c);
 
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 1c44b53cb15bb48391cf4300dbb93fd5fac34835..fb903e65e079a26034f76a30ef6f54b40613453d 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -575,8 +575,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	prev->userrsp = read_pda(oldrsp); 
 	write_pda(oldrsp, next->userrsp); 
 	write_pda(pcurrent, next_p); 
+
 	/* This must be here to ensure both math_state_restore() and
-	   kernel_fpu_begin() work consistently. */
+	   kernel_fpu_begin() work consistently. 
+	   And the AMD workaround requires it to be after DS reload. */
 	unlazy_fpu(prev_p);
 	write_pda(kernelstack,
 		  task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index c50b06765a80ddc16d0d17a59055a8cd42044cea..759070c827511bb482c3b770ff0f35407746936c 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -930,6 +930,10 @@ static int __init init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
 		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
 
+	/* Enable workaround for FXSAVE leak */
+	if (c->x86 >= 6)
+		set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
+
 	r = get_model_name(c);
 	if (!r) { 
 		switch (c->x86) { 
diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 5c0b5876b9318cf7e41af251c367597776330569..b44bfc6239cb51debea374b173e58b5469f8ff1c 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -71,6 +71,7 @@
 #define X86_FEATURE_P4		(3*32+ 7) /* P4 */
 #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
 #define X86_FEATURE_UP		(3*32+ 9) /* smp kernel running on up */
+#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
diff --git a/include/asm-i386/i387.h b/include/asm-i386/i387.h
index 152d0baa576a675b39bfa408a6b69f670e8ad118..7b1f01191e7063df3ab056dfdb17e6de0bcd828f 100644
--- a/include/asm-i386/i387.h
+++ b/include/asm-i386/i387.h
@@ -13,6 +13,7 @@
 
 #include <linux/sched.h>
 #include <linux/init.h>
+#include <linux/kernel_stat.h>
 #include <asm/processor.h>
 #include <asm/sigcontext.h>
 #include <asm/user.h>
@@ -38,17 +39,38 @@ extern void init_fpu(struct task_struct *);
 extern void kernel_fpu_begin(void);
 #define kernel_fpu_end() do { stts(); preempt_enable(); } while(0)
 
+/* We need a safe address that is cheap to find and that is already
+   in L1 during context switch. The best choices are unfortunately
+   different for UP and SMP */
+#ifdef CONFIG_SMP
+#define safe_address (__per_cpu_offset[0])
+#else
+#define safe_address (kstat_cpu(0).cpustat.user)
+#endif
+
 /*
  * These must be called with preempt disabled
  */
 static inline void __save_init_fpu( struct task_struct *tsk )
 {
+	/* Use more nops than strictly needed in case the compiler
+	   varies code */
 	alternative_input(
-		"fnsave %1 ; fwait ;" GENERIC_NOP2,
-		"fxsave %1 ; fnclex",
+		"fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
+		"fxsave %[fx]\n"
+		"bt $7,%[fsw] ; jc 1f ; fnclex\n1:",
 		X86_FEATURE_FXSR,
-		"m" (tsk->thread.i387.fxsave)
-		:"memory");
+		[fx] "m" (tsk->thread.i387.fxsave),
+		[fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory");
+	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+	   is pending.  Clear the x87 state here by setting it to fixed
+   	   values. __per_cpu_offset[0] is a random variable that should be in L1 */
+	alternative_input(
+		GENERIC_NOP8 GENERIC_NOP2,
+		"emms\n\t"	  	/* clear stack tags */
+		"fildl %[addr]", 	/* set F?P to defined value */
+		X86_FEATURE_FXSAVE_LEAK,
+		[addr] "m" (safe_address));
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index 76bb6193ae9107a380ac6b754d5dd498869eef5f..662964b74e348f228fbb9c874567c574d6339175 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -64,6 +64,7 @@
 #define X86_FEATURE_REP_GOOD	(3*32+ 4) /* rep microcode works well on this CPU */
 #define X86_FEATURE_CONSTANT_TSC (3*32+5) /* TSC runs at constant rate */
 #define X86_FEATURE_SYNC_RDTSC  (3*32+6)  /* RDTSC syncs CPU core */
+#define X86_FEATURE_FXSAVE_LEAK (3*32+7)  /* FIP/FOP/FDP leaks through FXSAVE */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */
diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h
index 876eb9a2fe7868a7c7ce01ef694403b3150f4601..cba8a3b0cded5a77191d9528710b04c85bc13c5b 100644
--- a/include/asm-x86_64/i387.h
+++ b/include/asm-x86_64/i387.h
@@ -72,6 +72,23 @@ extern int set_fpregs(struct task_struct *tsk,
 #define set_fpu_swd(t,val) ((t)->thread.i387.fxsave.swd = (val))
 #define set_fpu_fxsr_twd(t,val) ((t)->thread.i387.fxsave.twd = (val))
 
+#define X87_FSW_ES (1 << 7)	/* Exception Summary */
+
+/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
+   is pending. Clear the x87 state here by setting it to fixed
+   values. The kernel data segment can be sometimes 0 and sometimes
+   new user value. Both should be ok.
+   Use the PDA as safe address because it should be already in L1. */
+static inline void clear_fpu_state(struct i387_fxsave_struct *fx)
+{
+	if (unlikely(fx->swd & X87_FSW_ES))
+		 asm volatile("fnclex");
+	alternative_input(ASM_NOP8 ASM_NOP2,
+	     	     "    emms\n"		/* clear stack tags */
+	     	     "    fildl %%gs:0",	/* load to clear state */
+		     X86_FEATURE_FXSAVE_LEAK);
+}
+
 static inline int restore_fpu_checking(struct i387_fxsave_struct *fx) 
 { 
 	int err;
@@ -119,6 +136,7 @@ static inline int save_i387_checking(struct i387_fxsave_struct __user *fx)
 #endif
 	if (unlikely(err))
 		__clear_user(fx, sizeof(struct i387_fxsave_struct));
+	/* No need to clear here because the caller clears USED_MATH */
 	return err;
 } 
 
@@ -149,7 +167,7 @@ static inline void __fxsave_clear(struct task_struct *tsk)
 				"i" (offsetof(__typeof__(*tsk),
 					      thread.i387.fxsave)));
 #endif
-	__asm__ __volatile__("fnclex");
+	clear_fpu_state(&tsk->thread.i387.fxsave);
 }
 
 static inline void kernel_fpu_begin(void)