diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 7ee2ae6d5451d120126a04e3deeb07c02f2f9671..70a09f8a0383b2cdc25ab35f6aef43e8ac2905b4 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1633,6 +1633,64 @@ There are some more advanced barrier functions:
      operations" subsection for information on where to use these.
 
 
+ (*) dma_wmb();
+ (*) dma_rmb();
+
+     These are for use with consistent memory to guarantee the ordering
+     of writes or reads of shared memory accessible to both the CPU and a
+     DMA-capable device.
+
+     For example, consider a device driver that shares memory with a device
+     and uses a descriptor status value to indicate whether the descriptor
+     belongs to the device or the CPU, and a doorbell to notify the device
+     when new descriptors are available:
+
+	if (desc->status != DEVICE_OWN) {
+		/* do not read data until we own descriptor */
+		dma_rmb();
+
+		/* read/modify data */
+		read_data = desc->data;
+		desc->data = write_data;
+
+		/* flush modifications before status update */
+		dma_wmb();
+
+		/* assign ownership */
+		desc->status = DEVICE_OWN;
+
+		/* force memory to sync before notifying device via MMIO */
+		wmb();
+
+		/* notify device of new descriptors */
+		writel(DESC_NOTIFY, doorbell);
+	}
+
+     The dma_rmb() allows us to guarantee the device has released ownership
+     before we read the data from the descriptor, and the dma_wmb() allows
+     us to guarantee the data is written to the descriptor before the device
+     can see it now has ownership.  The wmb() is needed to guarantee that the
+     cache coherent memory writes have completed before attempting a write to
+     the cache incoherent MMIO region.
+
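+     The same barriers order the consumer side of a receive ring, where
+     the device writes a descriptor back to consistent memory and the CPU
+     must not read the remaining descriptor fields until the status write
+     is visible.  A sketch, with illustrative names (several Ethernet
+     drivers follow this shape):
+
+	if (desc->status & DESC_STATUS_DD) {
+		/* do not read other fields until write-back is seen */
+		dma_rmb();
+
+		skb = fetch_rx_buffer(ring, desc);
+	}
+
+     Here dma_rmb() suffices, and is typically cheaper than rmb(), because
+     both accesses are to cache coherent memory.
+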
+     See Documentation/DMA-API.txt for more information on consistent memory.
+
 MMIO WRITE BARRIER
 ------------------
 
diff --git a/arch/alpha/include/asm/barrier.h b/arch/alpha/include/asm/barrier.h
index 3832bdb794fec83d0df010413b009a311b57fec4..77516c87255d6e7abc909f827b930c4bbb154756 100644
--- a/arch/alpha/include/asm/barrier.h
+++ b/arch/alpha/include/asm/barrier.h
@@ -7,6 +7,57 @@
 #define rmb()	__asm__ __volatile__("mb": : :"memory")
 #define wmb()	__asm__ __volatile__("wmb": : :"memory")
 
+/**
+ * read_barrier_depends - Flush all pending reads that subsequent reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier.  All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data returned by
+ * any of the preceding reads.  This primitive is much lighter weight
+ * than rmb() on most CPUs, and is never heavier weight than
+ * rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies.  See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ * <programlisting>
+ *	CPU 0				CPU 1
+ *
+ *	b = 2;
+ *	memory_barrier();
+ *	p = &b;				q = p;
+ *					read_barrier_depends();
+ *					d = *q;
+ * </programlisting>
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends().  However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ * <programlisting>
+ *	CPU 0				CPU 1
+ *
+ *	a = 2;
+ *	memory_barrier();
+ *	b = 3;				y = b;
+ *					read_barrier_depends();
+ *					x = a;
+ * </programlisting>
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b".  Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
+ * in cases like this where there are no data dependencies.
+ */
 #define read_barrier_depends() __asm__ __volatile__("mb": : :"memory")
 
 #ifdef CONFIG_SMP
diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
index c6a3e73a6e2407215f3f9859212e41097071f492..d2f81e6b8c1cc5adb914ce38a7ab991b25801137 100644
--- a/arch/arm/include/asm/barrier.h
+++ b/arch/arm/include/asm/barrier.h
@@ -43,10 +43,14 @@
 #define mb()		do { dsb(); outer_sync(); } while (0)
 #define rmb()		dsb()
 #define wmb()		do { dsb(st); outer_sync(); } while (0)
+#define dma_rmb()	dmb(osh)
+#define dma_wmb()	dmb(oshst)
 #else
 #define mb()		barrier()
 #define rmb()		barrier()
 #define wmb()		barrier()
+#define dma_rmb()	barrier()
+#define dma_wmb()	barrier()
 #endif
 
 #ifndef CONFIG_SMP
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 6389d60574d9c320f69744b28a6051108f0dfff5..a5abb0062d6e943d67ca4ff5479e887194482367 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -32,6 +32,9 @@
 #define rmb()		dsb(ld)
 #define wmb()		dsb(st)
 
+#define dma_rmb()	dmb(oshld)
+#define dma_wmb()	dmb(oshst)
+
 #ifndef CONFIG_SMP
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
diff --git a/arch/blackfin/include/asm/barrier.h b/arch/blackfin/include/asm/barrier.h
index 420006877998ffc82a8de4300ec6bca859b2e560..dfb66fe88b34542e6e9286593e6284fd73a8ece2 100644
--- a/arch/blackfin/include/asm/barrier.h
+++ b/arch/blackfin/include/asm/barrier.h
@@ -22,6 +22,57 @@
 # define mb()	do { barrier(); smp_check_barrier(); smp_mark_barrier(); } while (0)
 # define rmb()	do { barrier(); smp_check_barrier(); } while (0)
 # define wmb()	do { barrier(); smp_mark_barrier(); } while (0)
+/*
+ * read_barrier_depends - Flush all pending reads that subsequent reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier.  All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data returned by
+ * any of the preceding reads.  This primitive is much lighter weight
+ * than rmb() on most CPUs, and is never heavier weight than
+ * rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies.  See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ * <programlisting>
+ *	CPU 0				CPU 1
+ *
+ *	b = 2;
+ *	memory_barrier();
+ *	p = &b;				q = p;
+ *					read_barrier_depends();
+ *					d = *q;
+ * </programlisting>
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends().  However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ * <programlisting>
+ *	CPU 0				CPU 1
+ *
+ *	a = 2;
+ *	memory_barrier();
+ *	b = 3;				y = b;
+ *					read_barrier_depends();
+ *					x = a;
+ * </programlisting>
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b".  Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
+ * in cases like this where there are no data dependencies.
+ */
 # define read_barrier_depends()	do { barrier(); smp_check_barrier(); } while (0)
 #endif
 
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
index a48957c7b445a2ad08a22d6bc7911b5d2d6049f0..f6769eb2bbf9b5ff8774017ac0eb332fcc7309d9 100644
--- a/arch/ia64/include/asm/barrier.h
+++ b/arch/ia64/include/asm/barrier.h
@@ -35,26 +35,25 @@
  * it's (presumably) much slower than mf and (b) mf.a is supported for
  * sequential memory pages only.
  */
-#define mb()	ia64_mf()
-#define rmb()	mb()
-#define wmb()	mb()
-#define read_barrier_depends()	do { } while(0)
+#define mb()		ia64_mf()
+#define rmb()		mb()
+#define wmb()		mb()
+
+#define dma_rmb()	mb()
+#define dma_wmb()	mb()
 
 #ifdef CONFIG_SMP
 # define smp_mb()	mb()
-# define smp_rmb()	rmb()
-# define smp_wmb()	wmb()
-# define smp_read_barrier_depends()	read_barrier_depends()
-
 #else
-
 # define smp_mb()	barrier()
-# define smp_rmb()	barrier()
-# define smp_wmb()	barrier()
-# define smp_read_barrier_depends()	do { } while(0)
-
 #endif
 
+#define smp_rmb()	smp_mb()
+#define smp_wmb()	smp_mb()
+
+#define read_barrier_depends()		do { } while (0)
+#define smp_read_barrier_depends()	do { } while (0)
+
 #define smp_mb__before_atomic()	barrier()
 #define smp_mb__after_atomic()	barrier()
 
diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h
index c7591e80067c0e9fc150872262d9375a0faad6e6..d703d8e26a656c1560202d3c1af6a39ed95771e5 100644
--- a/arch/metag/include/asm/barrier.h
+++ b/arch/metag/include/asm/barrier.h
@@ -4,8 +4,6 @@
 #include <asm/metag_mem.h>
 
 #define nop()		asm volatile ("NOP")
-#define mb()		wmb()
-#define rmb()		barrier()
 
 #ifdef CONFIG_METAG_META21
 
@@ -41,13 +39,13 @@ static inline void wr_fence(void)
 
 #endif /* !CONFIG_METAG_META21 */
 
-static inline void wmb(void)
-{
-	/* flush writes through the write combiner */
-	wr_fence();
-}
+/* flush writes through the write combiner */
+#define mb()		wr_fence()
+#define rmb()		barrier()
+#define wmb()		mb()
 
-#define read_barrier_depends()  do { } while (0)
+#define dma_rmb()	rmb()
+#define dma_wmb()	wmb()
 
 #ifndef CONFIG_SMP
 #define fence()		do { } while (0)
@@ -82,7 +80,10 @@ static inline void fence(void)
 #define smp_wmb()       barrier()
 #endif
 #endif
-#define smp_read_barrier_depends()     do { } while (0)
+
+#define read_barrier_depends()		do { } while (0)
+#define smp_read_barrier_depends()	do { } while (0)
+
 #define set_mb(var, value) do { var = value; smp_mb(); } while (0)
 
 #define smp_store_release(p, v)						\
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index d0101dd0575eecce92dbda6f03b6daada470fb47..2b8bbbcb9be0e9f8c5c6f7dbc1a2c6f692185443 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -10,58 +10,6 @@
 
 #include <asm/addrspace.h>
 
-/*
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier.  All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads.  This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies.  See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	b = 2;
- *	memory_barrier();
- *	p = &b;				q = p;
- *					read_barrier_depends();
- *					d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends().  However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	a = 2;
- *	memory_barrier();
- *	b = 3;				y = b;
- *					read_barrier_depends();
- *					x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b".  Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like this where there are no data dependencies.
- */
-
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
@@ -127,20 +75,21 @@
 
 #include <asm/wbflush.h>
 
-#define wmb()		fast_wmb()
-#define rmb()		fast_rmb()
 #define mb()		wbflush()
 #define iob()		wbflush()
 
 #else /* !CONFIG_CPU_HAS_WB */
 
-#define wmb()		fast_wmb()
-#define rmb()		fast_rmb()
 #define mb()		fast_mb()
 #define iob()		fast_iob()
 
 #endif /* !CONFIG_CPU_HAS_WB */
 
+#define wmb()		fast_wmb()
+#define rmb()		fast_rmb()
+#define dma_wmb()	fast_wmb()
+#define dma_rmb()	fast_rmb()
+
 #if defined(CONFIG_WEAK_ORDERING) && defined(CONFIG_SMP)
 # ifdef CONFIG_CPU_CAVIUM_OCTEON
 #  define smp_mb()	__sync()
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index bab79a110c7b05ceb8a4320efe4290257e186250..a3bf5be111ff1d073eb329476ce773c07ce32ed5 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -33,12 +33,9 @@
 #define mb()   __asm__ __volatile__ ("sync" : : : "memory")
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
-#define read_barrier_depends()  do { } while(0)
 
 #define set_mb(var, value)	do { var = value; mb(); } while (0)
 
-#ifdef CONFIG_SMP
-
 #ifdef __SUBARCH_HAS_LWSYNC
 #    define SMPWMB      LWSYNC
 #else
@@ -46,20 +43,26 @@
 #endif
 
 #define __lwsync()	__asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory")
+#define dma_rmb()	__lwsync()
+#define dma_wmb()	__asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
+
+#ifdef CONFIG_SMP
+#define smp_lwsync()	__lwsync()
 
 #define smp_mb()	mb()
 #define smp_rmb()	__lwsync()
 #define smp_wmb()	__asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
-#define smp_read_barrier_depends()	read_barrier_depends()
 #else
-#define __lwsync()	barrier()
+#define smp_lwsync()	barrier()
 
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	do { } while(0)
 #endif /* CONFIG_SMP */
 
+#define read_barrier_depends()		do { } while (0)
+#define smp_read_barrier_depends()	do { } while (0)
+
 /*
  * This is a barrier which prevents following instructions from being
  * started until the value of the argument x is known.  For example, if
@@ -72,7 +75,7 @@
 #define smp_store_release(p, v)						\
 do {									\
 	compiletime_assert_atomic_type(*p);				\
-	__lwsync();							\
+	smp_lwsync();							\
 	ACCESS_ONCE(*p) = (v);						\
 } while (0)
 
@@ -80,7 +83,7 @@ do {									\
 ({									\
 	typeof(*p) ___p1 = ACCESS_ONCE(*p);				\
 	compiletime_assert_atomic_type(*p);				\
-	__lwsync();							\
+	smp_lwsync();							\
 	___p1;								\
 })
 
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index b5dce6544d76e3357e4e8629333817008a0ee183..8d724718ec21c8d0331e1b8fbc14c96cb5892649 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -24,11 +24,14 @@
 
 #define rmb()				mb()
 #define wmb()				mb()
-#define read_barrier_depends()		do { } while(0)
+#define dma_rmb()			rmb()
+#define dma_wmb()			wmb()
 #define smp_mb()			mb()
 #define smp_rmb()			rmb()
 #define smp_wmb()			wmb()
-#define smp_read_barrier_depends()	read_barrier_depends()
+
+#define read_barrier_depends()		do { } while (0)
+#define smp_read_barrier_depends()	do { } while (0)
 
 #define smp_mb__before_atomic()		smp_mb()
 #define smp_mb__after_atomic()		smp_mb()
diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h
index 305dcc3dc721261b3a84fd402acd247956b60a08..76648941fea71b4327e058d6e995a3331587541c 100644
--- a/arch/sparc/include/asm/barrier_64.h
+++ b/arch/sparc/include/asm/barrier_64.h
@@ -37,7 +37,9 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define rmb()	__asm__ __volatile__("":::"memory")
 #define wmb()	__asm__ __volatile__("":::"memory")
 
-#define read_barrier_depends()		do { } while(0)
+#define dma_rmb()	rmb()
+#define dma_wmb()	wmb()
+
 #define set_mb(__var, __value) \
 	do { __var = __value; membar_safe("#StoreLoad"); } while(0)
 
@@ -51,7 +53,8 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define smp_wmb()	__asm__ __volatile__("":::"memory")
 #endif
 
-#define smp_read_barrier_depends()	do { } while(0)
+#define read_barrier_depends()		do { } while (0)
+#define smp_read_barrier_depends()	do { } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 0f4460b5636ded85a3e9ad4d6d755dbe32b745eb..2ab1eb33106eec42eff90d27b98cb698b5c4c835 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -24,78 +24,28 @@
 #define wmb()	asm volatile("sfence" ::: "memory")
 #endif
 
-/**
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier.  All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads.  This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies.  See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	b = 2;
- *	memory_barrier();
- *	p = &b;				q = p;
- *					read_barrier_depends();
- *					d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends().  However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	a = 2;
- *	memory_barrier();
- *	b = 3;				y = b;
- *					read_barrier_depends();
- *					x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b".  Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like this where there are no data dependencies.
- **/
-
-#define read_barrier_depends()	do { } while (0)
-
-#ifdef CONFIG_SMP
-#define smp_mb()	mb()
 #ifdef CONFIG_X86_PPRO_FENCE
-# define smp_rmb()	rmb()
+#define dma_rmb()	rmb()
 #else
-# define smp_rmb()	barrier()
+#define dma_rmb()	barrier()
 #endif
+#define dma_wmb()	barrier()
+
+#ifdef CONFIG_SMP
+#define smp_mb()	mb()
+#define smp_rmb()	dma_rmb()
 #define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	read_barrier_depends()
 #define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
 #else /* !SMP */
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	do { } while (0)
 #define set_mb(var, value) do { var = value; barrier(); } while (0)
 #endif /* SMP */
 
+#define read_barrier_depends()		do { } while (0)
+#define smp_read_barrier_depends()	do { } while (0)
+
 #if defined(CONFIG_X86_PPRO_FENCE)
 
 /*
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index cc04e67bfd0589966e1c4b4b2fa29079c439ae24..2d7d9a1f5b531ee399b7a0e6a1f9eb4a190fd820 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -29,20 +29,18 @@
 
 #endif /* CONFIG_X86_32 */
 
-#define read_barrier_depends()	do { } while (0)
-
-#ifdef CONFIG_SMP
-
-#define smp_mb()	mb()
 #ifdef CONFIG_X86_PPRO_FENCE
-#define smp_rmb()	rmb()
+#define dma_rmb()	rmb()
 #else /* CONFIG_X86_PPRO_FENCE */
-#define smp_rmb()	barrier()
+#define dma_rmb()	barrier()
 #endif /* CONFIG_X86_PPRO_FENCE */
+#define dma_wmb()	barrier()
 
-#define smp_wmb()	barrier()
+#ifdef CONFIG_SMP
 
-#define smp_read_barrier_depends()	read_barrier_depends()
+#define smp_mb()	mb()
+#define smp_rmb()	dma_rmb()
+#define smp_wmb()	barrier()
 #define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
 
 #else /* CONFIG_SMP */
@@ -50,11 +48,13 @@
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	do { } while (0)
 #define set_mb(var, value) do { var = value; barrier(); } while (0)
 
 #endif /* CONFIG_SMP */
 
+#define read_barrier_depends()		do { } while (0)
+#define smp_read_barrier_depends()	do { } while (0)
+
 /*
  * Stop RDTSC speculation. This is needed when you need to use RDTSC
  * (or get_cycles or vread that possibly accesses the TSC) in a defined
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 4f4c2a7888e5d74ee06ae58df8feaf5f1dea3123..feb29c4526f7eea698ef8568b784daafe467980d 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -684,10 +684,9 @@ static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port,
 					 struct fixed_phy_status *status)
 {
 	struct bcm_sf2_priv *priv = ds_to_priv(ds);
-	u32 link, duplex, pause, speed;
+	u32 duplex, pause, speed;
 	u32 reg;
 
-	link = core_readl(priv, CORE_LNKSTS);
 	duplex = core_readl(priv, CORE_DUPSTS);
 	pause = core_readl(priv, CORE_PAUSESTS);
 	speed = core_readl(priv, CORE_SPDSTS);
@@ -701,22 +700,26 @@ static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port,
 	 * which means that we need to force the link at the port override
 	 * level to get the data to flow. We do use what the interrupt handler
 	 * did determine before.
+	 *
+	 * For the other ports, we just force the link status, since this is
+	 * a fixed PHY device.
 	 */
 	if (port == 7) {
 		status->link = priv->port_sts[port].link;
-		reg = core_readl(priv, CORE_STS_OVERRIDE_GMIIP_PORT(7));
-		reg |= SW_OVERRIDE;
-		if (status->link)
-			reg |= LINK_STS;
-		else
-			reg &= ~LINK_STS;
-		core_writel(priv, reg, CORE_STS_OVERRIDE_GMIIP_PORT(7));
 		status->duplex = 1;
 	} else {
-		status->link = !!(link & (1 << port));
+		status->link = 1;
 		status->duplex = !!(duplex & (1 << port));
 	}
 
+	reg = core_readl(priv, CORE_STS_OVERRIDE_GMIIP_PORT(port));
+	reg |= SW_OVERRIDE;
+	if (status->link)
+		reg |= LINK_STS;
+	else
+		reg &= ~LINK_STS;
+	core_writel(priv, reg, CORE_STS_OVERRIDE_GMIIP_PORT(port));
+
 	switch (speed) {
 	case SPDSTS_10:
 		status->speed = SPEED_10;
diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index b6bc318b148e34a0e502ab1dd1a8c3fc88583cc3..0987d2a77f9f70ebd8c22ace98964093b40d0ebd 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -66,23 +66,25 @@ static unsigned int macb_tx_ring_wrap(unsigned int index)
 	return index & (TX_RING_SIZE - 1);
 }
 
-static struct macb_dma_desc *macb_tx_desc(struct macb *bp, unsigned int index)
+static struct macb_dma_desc *macb_tx_desc(struct macb_queue *queue,
+					  unsigned int index)
 {
-	return &bp->tx_ring[macb_tx_ring_wrap(index)];
+	return &queue->tx_ring[macb_tx_ring_wrap(index)];
 }
 
-static struct macb_tx_skb *macb_tx_skb(struct macb *bp, unsigned int index)
+static struct macb_tx_skb *macb_tx_skb(struct macb_queue *queue,
+				       unsigned int index)
 {
-	return &bp->tx_skb[macb_tx_ring_wrap(index)];
+	return &queue->tx_skb[macb_tx_ring_wrap(index)];
 }
 
-static dma_addr_t macb_tx_dma(struct macb *bp, unsigned int index)
+static dma_addr_t macb_tx_dma(struct macb_queue *queue, unsigned int index)
 {
 	dma_addr_t offset;
 
 	offset = macb_tx_ring_wrap(index) * sizeof(struct macb_dma_desc);
 
-	return bp->tx_ring_dma + offset;
+	return queue->tx_ring_dma + offset;
 }
 
 static unsigned int macb_rx_ring_wrap(unsigned int index)
@@ -490,38 +492,49 @@ static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb)
 
 static void macb_tx_error_task(struct work_struct *work)
 {
-	struct macb	*bp = container_of(work, struct macb, tx_error_task);
+	struct macb_queue	*queue = container_of(work, struct macb_queue,
+						      tx_error_task);
+	struct macb		*bp = queue->bp;
 	struct macb_tx_skb	*tx_skb;
+	struct macb_dma_desc	*desc;
 	struct sk_buff		*skb;
 	unsigned int		tail;
+	unsigned long		flags;
+
+	netdev_vdbg(bp->dev, "macb_tx_error_task: q = %u, t = %u, h = %u\n",
+		    (unsigned int)(queue - bp->queues),
+		    queue->tx_tail, queue->tx_head);
 
-	netdev_vdbg(bp->dev, "macb_tx_error_task: t = %u, h = %u\n",
-		    bp->tx_tail, bp->tx_head);
+	/* Prevent the queue IRQ handlers from running: each of them may call
+	 * macb_tx_interrupt(), which in turn may call netif_wake_subqueue().
+	 * As explained below, we have to halt transmission before updating
+	 * the TBQP registers, so we call netif_tx_stop_all_queues() to notify
+	 * the networking stack that the macb/gem transmit path is halted.
+	 */
+	spin_lock_irqsave(&bp->lock, flags);
 
 	/* Make sure nobody is trying to queue up new packets */
-	netif_stop_queue(bp->dev);
+	netif_tx_stop_all_queues(bp->dev);
 
 	/*
 	 * Stop transmission now
 	 * (in case we have just queued new packets)
+	 * macb/gem must be halted to write the TBQP register
 	 */
 	if (macb_halt_tx(bp))
 		/* Just complain for now, reinitializing TX path can be good */
 		netdev_err(bp->dev, "BUG: halt tx timed out\n");
 
-	/* No need for the lock here as nobody will interrupt us anymore */
-
 	/*
 	 * Treat frames in TX queue including the ones that caused the error.
 	 * Free transmit buffers in upper layer.
 	 */
-	for (tail = bp->tx_tail; tail != bp->tx_head; tail++) {
-		struct macb_dma_desc	*desc;
-		u32			ctrl;
+	for (tail = queue->tx_tail; tail != queue->tx_head; tail++) {
+		u32	ctrl;
 
-		desc = macb_tx_desc(bp, tail);
+		desc = macb_tx_desc(queue, tail);
 		ctrl = desc->ctrl;
-		tx_skb = macb_tx_skb(bp, tail);
+		tx_skb = macb_tx_skb(queue, tail);
 		skb = tx_skb->skb;
 
 		if (ctrl & MACB_BIT(TX_USED)) {
@@ -529,7 +542,7 @@ static void macb_tx_error_task(struct work_struct *work)
 			while (!skb) {
 				macb_tx_unmap(bp, tx_skb);
 				tail++;
-				tx_skb = macb_tx_skb(bp, tail);
+				tx_skb = macb_tx_skb(queue, tail);
 				skb = tx_skb->skb;
 			}
 
@@ -558,45 +571,56 @@ static void macb_tx_error_task(struct work_struct *work)
 		macb_tx_unmap(bp, tx_skb);
 	}
 
+	/* Set end of TX queue */
+	desc = macb_tx_desc(queue, 0);
+	desc->addr = 0;
+	desc->ctrl = MACB_BIT(TX_USED);
+
 	/* Make descriptor updates visible to hardware */
 	wmb();
 
 	/* Reinitialize the TX desc queue */
-	macb_writel(bp, TBQP, bp->tx_ring_dma);
+	queue_writel(queue, TBQP, queue->tx_ring_dma);
 	/* Make TX ring reflect state of hardware */
-	bp->tx_head = bp->tx_tail = 0;
-
-	/* Now we are ready to start transmission again */
-	netif_wake_queue(bp->dev);
+	queue->tx_head = 0;
+	queue->tx_tail = 0;
 
 	/* Housework before enabling TX IRQ */
 	macb_writel(bp, TSR, macb_readl(bp, TSR));
-	macb_writel(bp, IER, MACB_TX_INT_FLAGS);
+	queue_writel(queue, IER, MACB_TX_INT_FLAGS);
+
+	/* Now we are ready to start transmission again */
+	netif_tx_start_all_queues(bp->dev);
+	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
+
+	spin_unlock_irqrestore(&bp->lock, flags);
 }
 
-static void macb_tx_interrupt(struct macb *bp)
+static void macb_tx_interrupt(struct macb_queue *queue)
 {
 	unsigned int tail;
 	unsigned int head;
 	u32 status;
+	struct macb *bp = queue->bp;
+	u16 queue_index = queue - bp->queues;
 
 	status = macb_readl(bp, TSR);
 	macb_writel(bp, TSR, status);
 
 	if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-		macb_writel(bp, ISR, MACB_BIT(TCOMP));
+		queue_writel(queue, ISR, MACB_BIT(TCOMP));
 
 	netdev_vdbg(bp->dev, "macb_tx_interrupt status = 0x%03lx\n",
 		(unsigned long)status);
 
-	head = bp->tx_head;
-	for (tail = bp->tx_tail; tail != head; tail++) {
+	head = queue->tx_head;
+	for (tail = queue->tx_tail; tail != head; tail++) {
 		struct macb_tx_skb	*tx_skb;
 		struct sk_buff		*skb;
 		struct macb_dma_desc	*desc;
 		u32			ctrl;
 
-		desc = macb_tx_desc(bp, tail);
+		desc = macb_tx_desc(queue, tail);
 
 		/* Make hw descriptor updates visible to CPU */
 		rmb();
@@ -611,7 +635,7 @@ static void macb_tx_interrupt(struct macb *bp)
 
 		/* Process all buffers of the current transmitted frame */
 		for (;; tail++) {
-			tx_skb = macb_tx_skb(bp, tail);
+			tx_skb = macb_tx_skb(queue, tail);
 			skb = tx_skb->skb;
 
 			/* First, update TX stats if needed */
@@ -634,11 +658,11 @@ static void macb_tx_interrupt(struct macb *bp)
 		}
 	}
 
-	bp->tx_tail = tail;
-	if (netif_queue_stopped(bp->dev)
-			&& CIRC_CNT(bp->tx_head, bp->tx_tail,
-				    TX_RING_SIZE) <= MACB_TX_WAKEUP_THRESH)
-		netif_wake_queue(bp->dev);
+	queue->tx_tail = tail;
+	if (__netif_subqueue_stopped(bp->dev, queue_index) &&
+	    CIRC_CNT(queue->tx_head, queue->tx_tail,
+		     TX_RING_SIZE) <= MACB_TX_WAKEUP_THRESH)
+		netif_wake_subqueue(bp->dev, queue_index);
 }
 
 static void gem_rx_refill(struct macb *bp)
@@ -949,11 +973,12 @@ static int macb_poll(struct napi_struct *napi, int budget)
 
 static irqreturn_t macb_interrupt(int irq, void *dev_id)
 {
-	struct net_device *dev = dev_id;
-	struct macb *bp = netdev_priv(dev);
+	struct macb_queue *queue = dev_id;
+	struct macb *bp = queue->bp;
+	struct net_device *dev = bp->dev;
 	u32 status;
 
-	status = macb_readl(bp, ISR);
+	status = queue_readl(queue, ISR);
 
 	if (unlikely(!status))
 		return IRQ_NONE;
@@ -963,11 +988,13 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 	while (status) {
 		/* close possible race with dev_close */
 		if (unlikely(!netif_running(dev))) {
-			macb_writel(bp, IDR, -1);
+			queue_writel(queue, IDR, -1);
 			break;
 		}
 
-		netdev_vdbg(bp->dev, "isr = 0x%08lx\n", (unsigned long)status);
+		netdev_vdbg(bp->dev, "queue = %u, isr = 0x%08lx\n",
+			    (unsigned int)(queue - bp->queues),
+			    (unsigned long)status);
 
 		if (status & MACB_RX_INT_FLAGS) {
 			/*
@@ -977,9 +1004,9 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 			 * is already scheduled, so disable interrupts
 			 * now.
 			 */
-			macb_writel(bp, IDR, MACB_RX_INT_FLAGS);
+			queue_writel(queue, IDR, MACB_RX_INT_FLAGS);
 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-				macb_writel(bp, ISR, MACB_BIT(RCOMP));
+				queue_writel(queue, ISR, MACB_BIT(RCOMP));
 
 			if (napi_schedule_prep(&bp->napi)) {
 				netdev_vdbg(bp->dev, "scheduling RX softirq\n");
@@ -988,17 +1015,17 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 		}
 
 		if (unlikely(status & (MACB_TX_ERR_FLAGS))) {
-			macb_writel(bp, IDR, MACB_TX_INT_FLAGS);
-			schedule_work(&bp->tx_error_task);
+			queue_writel(queue, IDR, MACB_TX_INT_FLAGS);
+			schedule_work(&queue->tx_error_task);
 
 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-				macb_writel(bp, ISR, MACB_TX_ERR_FLAGS);
+				queue_writel(queue, ISR, MACB_TX_ERR_FLAGS);
 
 			break;
 		}
 
 		if (status & MACB_BIT(TCOMP))
-			macb_tx_interrupt(bp);
+			macb_tx_interrupt(queue);
 
 		/*
 		 * Link change detection isn't possible with RMII, so we'll
@@ -1013,7 +1040,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 				bp->hw_stats.macb.rx_overruns++;
 
 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-				macb_writel(bp, ISR, MACB_BIT(ISR_ROVR));
+				queue_writel(queue, ISR, MACB_BIT(ISR_ROVR));
 		}
 
 		if (status & MACB_BIT(HRESP)) {
@@ -1025,10 +1052,10 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 			netdev_err(dev, "DMA bus error: HRESP not OK\n");
 
 			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
-				macb_writel(bp, ISR, MACB_BIT(HRESP));
+				queue_writel(queue, ISR, MACB_BIT(HRESP));
 		}
 
-		status = macb_readl(bp, ISR);
+		status = queue_readl(queue, ISR);
 	}
 
 	spin_unlock(&bp->lock);
@@ -1043,10 +1070,14 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
  */
 static void macb_poll_controller(struct net_device *dev)
 {
+	struct macb *bp = netdev_priv(dev);
+	struct macb_queue *queue;
 	unsigned long flags;
+	unsigned int q;
 
 	local_irq_save(flags);
-	macb_interrupt(dev->irq, dev);
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
+		macb_interrupt(dev->irq, queue);
 	local_irq_restore(flags);
 }
 #endif
@@ -1058,10 +1089,11 @@ static inline unsigned int macb_count_tx_descriptors(struct macb *bp,
 }
 
 static unsigned int macb_tx_map(struct macb *bp,
+				struct macb_queue *queue,
 				struct sk_buff *skb)
 {
 	dma_addr_t mapping;
-	unsigned int len, entry, i, tx_head = bp->tx_head;
+	unsigned int len, entry, i, tx_head = queue->tx_head;
 	struct macb_tx_skb *tx_skb = NULL;
 	struct macb_dma_desc *desc;
 	unsigned int offset, size, count = 0;
@@ -1075,7 +1107,7 @@ static unsigned int macb_tx_map(struct macb *bp,
 	while (len) {
 		size = min(len, bp->max_tx_length);
 		entry = macb_tx_ring_wrap(tx_head);
-		tx_skb = &bp->tx_skb[entry];
+		tx_skb = &queue->tx_skb[entry];
 
 		mapping = dma_map_single(&bp->pdev->dev,
 					 skb->data + offset,
@@ -1104,7 +1136,7 @@ static unsigned int macb_tx_map(struct macb *bp,
 		while (len) {
 			size = min(len, bp->max_tx_length);
 			entry = macb_tx_ring_wrap(tx_head);
-			tx_skb = &bp->tx_skb[entry];
+			tx_skb = &queue->tx_skb[entry];
 
 			mapping = skb_frag_dma_map(&bp->pdev->dev, frag,
 						   offset, size, DMA_TO_DEVICE);
@@ -1143,14 +1175,14 @@ static unsigned int macb_tx_map(struct macb *bp,
 	i = tx_head;
 	entry = macb_tx_ring_wrap(i);
 	ctrl = MACB_BIT(TX_USED);
-	desc = &bp->tx_ring[entry];
+	desc = &queue->tx_ring[entry];
 	desc->ctrl = ctrl;
 
 	do {
 		i--;
 		entry = macb_tx_ring_wrap(i);
-		tx_skb = &bp->tx_skb[entry];
-		desc = &bp->tx_ring[entry];
+		tx_skb = &queue->tx_skb[entry];
+		desc = &queue->tx_ring[entry];
 
 		ctrl = (u32)tx_skb->size;
 		if (eof) {
@@ -1167,17 +1199,17 @@ static unsigned int macb_tx_map(struct macb *bp,
 		 */
 		wmb();
 		desc->ctrl = ctrl;
-	} while (i != bp->tx_head);
+	} while (i != queue->tx_head);
 
-	bp->tx_head = tx_head;
+	queue->tx_head = tx_head;
 
 	return count;
 
 dma_error:
 	netdev_err(bp->dev, "TX DMA map failed\n");
 
-	for (i = bp->tx_head; i != tx_head; i++) {
-		tx_skb = macb_tx_skb(bp, i);
+	for (i = queue->tx_head; i != tx_head; i++) {
+		tx_skb = macb_tx_skb(queue, i);
 
 		macb_tx_unmap(bp, tx_skb);
 	}
@@ -1187,14 +1219,16 @@ static unsigned int macb_tx_map(struct macb *bp,
 
 static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	u16 queue_index = skb_get_queue_mapping(skb);
 	struct macb *bp = netdev_priv(dev);
+	struct macb_queue *queue = &bp->queues[queue_index];
 	unsigned long flags;
 	unsigned int count, nr_frags, frag_size, f;
 
 #if defined(DEBUG) && defined(VERBOSE_DEBUG)
 	netdev_vdbg(bp->dev,
-		   "start_xmit: len %u head %p data %p tail %p end %p\n",
-		   skb->len, skb->head, skb->data,
+		   "start_xmit: queue %hu len %u head %p data %p tail %p end %p\n",
+		   queue_index, skb->len, skb->head, skb->data,
 		   skb_tail_pointer(skb), skb_end_pointer(skb));
 	print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_OFFSET, 16, 1,
 		       skb->data, 16, true);
@@ -1214,16 +1248,16 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	spin_lock_irqsave(&bp->lock, flags);
 
 	/* This is a hard error, log it. */
-	if (CIRC_SPACE(bp->tx_head, bp->tx_tail, TX_RING_SIZE) < count) {
-		netif_stop_queue(dev);
+	if (CIRC_SPACE(queue->tx_head, queue->tx_tail, TX_RING_SIZE) < count) {
+		netif_stop_subqueue(dev, queue_index);
 		spin_unlock_irqrestore(&bp->lock, flags);
 		netdev_dbg(bp->dev, "tx_head = %u, tx_tail = %u\n",
-			   bp->tx_head, bp->tx_tail);
+			   queue->tx_head, queue->tx_tail);
 		return NETDEV_TX_BUSY;
 	}
 
 	/* Map socket buffer for DMA transfer */
-	if (!macb_tx_map(bp, skb)) {
+	if (!macb_tx_map(bp, queue, skb)) {
 		dev_kfree_skb_any(skb);
 		goto unlock;
 	}
@@ -1235,8 +1269,8 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
 
-	if (CIRC_SPACE(bp->tx_head, bp->tx_tail, TX_RING_SIZE) < 1)
-		netif_stop_queue(dev);
+	if (CIRC_SPACE(queue->tx_head, queue->tx_tail, TX_RING_SIZE) < 1)
+		netif_stop_subqueue(dev, queue_index);
 
 unlock:
 	spin_unlock_irqrestore(&bp->lock, flags);
@@ -1304,20 +1338,24 @@ static void macb_free_rx_buffers(struct macb *bp)
 
 static void macb_free_consistent(struct macb *bp)
 {
-	if (bp->tx_skb) {
-		kfree(bp->tx_skb);
-		bp->tx_skb = NULL;
-	}
+	struct macb_queue *queue;
+	unsigned int q;
+
 	bp->macbgem_ops.mog_free_rx_buffers(bp);
 	if (bp->rx_ring) {
 		dma_free_coherent(&bp->pdev->dev, RX_RING_BYTES,
 				  bp->rx_ring, bp->rx_ring_dma);
 		bp->rx_ring = NULL;
 	}
-	if (bp->tx_ring) {
-		dma_free_coherent(&bp->pdev->dev, TX_RING_BYTES,
-				  bp->tx_ring, bp->tx_ring_dma);
-		bp->tx_ring = NULL;
+
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		kfree(queue->tx_skb);
+		queue->tx_skb = NULL;
+		if (queue->tx_ring) {
+			dma_free_coherent(&bp->pdev->dev, TX_RING_BYTES,
+					  queue->tx_ring, queue->tx_ring_dma);
+			queue->tx_ring = NULL;
+		}
 	}
 }
 
@@ -1354,12 +1392,27 @@ static int macb_alloc_rx_buffers(struct macb *bp)
 
 static int macb_alloc_consistent(struct macb *bp)
 {
+	struct macb_queue *queue;
+	unsigned int q;
 	int size;
 
-	size = TX_RING_SIZE * sizeof(struct macb_tx_skb);
-	bp->tx_skb = kmalloc(size, GFP_KERNEL);
-	if (!bp->tx_skb)
-		goto out_err;
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		size = TX_RING_BYTES;
+		queue->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
+						    &queue->tx_ring_dma,
+						    GFP_KERNEL);
+		if (!queue->tx_ring)
+			goto out_err;
+		netdev_dbg(bp->dev,
+			   "Allocated TX ring for queue %u of %d bytes at %08lx (mapped %p)\n",
+			   q, size, (unsigned long)queue->tx_ring_dma,
+			   queue->tx_ring);
+
+		size = TX_RING_SIZE * sizeof(struct macb_tx_skb);
+		queue->tx_skb = kmalloc(size, GFP_KERNEL);
+		if (!queue->tx_skb)
+			goto out_err;
+	}
 
 	size = RX_RING_BYTES;
 	bp->rx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
@@ -1370,15 +1423,6 @@ static int macb_alloc_consistent(struct macb *bp)
 		   "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
 		   size, (unsigned long)bp->rx_ring_dma, bp->rx_ring);
 
-	size = TX_RING_BYTES;
-	bp->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
-					 &bp->tx_ring_dma, GFP_KERNEL);
-	if (!bp->tx_ring)
-		goto out_err;
-	netdev_dbg(bp->dev,
-		   "Allocated TX ring of %d bytes at %08lx (mapped %p)\n",
-		   size, (unsigned long)bp->tx_ring_dma, bp->tx_ring);
-
 	if (bp->macbgem_ops.mog_alloc_rx_buffers(bp))
 		goto out_err;
 
@@ -1391,15 +1435,22 @@ static int macb_alloc_consistent(struct macb *bp)
 
 static void gem_init_rings(struct macb *bp)
 {
+	struct macb_queue *queue;
+	unsigned int q;
 	int i;
 
-	for (i = 0; i < TX_RING_SIZE; i++) {
-		bp->tx_ring[i].addr = 0;
-		bp->tx_ring[i].ctrl = MACB_BIT(TX_USED);
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		for (i = 0; i < TX_RING_SIZE; i++) {
+			queue->tx_ring[i].addr = 0;
+			queue->tx_ring[i].ctrl = MACB_BIT(TX_USED);
+		}
+		queue->tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
+		queue->tx_head = 0;
+		queue->tx_tail = 0;
 	}
-	bp->tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
 
-	bp->rx_tail = bp->rx_prepared_head = bp->tx_head = bp->tx_tail = 0;
+	bp->rx_tail = 0;
+	bp->rx_prepared_head = 0;
 
 	gem_rx_refill(bp);
 }
@@ -1418,16 +1469,21 @@ static void macb_init_rings(struct macb *bp)
 	bp->rx_ring[RX_RING_SIZE - 1].addr |= MACB_BIT(RX_WRAP);
 
 	for (i = 0; i < TX_RING_SIZE; i++) {
-		bp->tx_ring[i].addr = 0;
-		bp->tx_ring[i].ctrl = MACB_BIT(TX_USED);
+		bp->queues[0].tx_ring[i].addr = 0;
+		bp->queues[0].tx_ring[i].ctrl = MACB_BIT(TX_USED);
+		bp->queues[0].tx_head = 0;
+		bp->queues[0].tx_tail = 0;
 	}
-	bp->tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
+	bp->queues[0].tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
 
-	bp->rx_tail = bp->tx_head = bp->tx_tail = 0;
+	bp->rx_tail = 0;
 }
 
 static void macb_reset_hw(struct macb *bp)
 {
+	struct macb_queue *queue;
+	unsigned int q;
+
 	/*
 	 * Disable RX and TX (XXX: Should we halt the transmission
 	 * more gracefully?)
@@ -1442,8 +1498,10 @@ static void macb_reset_hw(struct macb *bp)
 	macb_writel(bp, RSR, -1);
 
 	/* Disable all interrupts */
-	macb_writel(bp, IDR, -1);
-	macb_readl(bp, ISR);
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		queue_writel(queue, IDR, -1);
+		queue_readl(queue, ISR);
+	}
 }
 
 static u32 gem_mdc_clk_div(struct macb *bp)
@@ -1540,6 +1598,9 @@ static void macb_configure_dma(struct macb *bp)
 
 static void macb_init_hw(struct macb *bp)
 {
+	struct macb_queue *queue;
+	unsigned int q;
+
 	u32 config;
 
 	macb_reset_hw(bp);
@@ -1565,16 +1626,18 @@ static void macb_init_hw(struct macb *bp)
 
 	/* Initialize TX and RX buffers */
 	macb_writel(bp, RBQP, bp->rx_ring_dma);
-	macb_writel(bp, TBQP, bp->tx_ring_dma);
+	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+		queue_writel(queue, TBQP, queue->tx_ring_dma);
+
+		/* Enable interrupts */
+		queue_writel(queue, IER,
+			     MACB_RX_INT_FLAGS |
+			     MACB_TX_INT_FLAGS |
+			     MACB_BIT(HRESP));
+	}
 
 	/* Enable TX and RX */
 	macb_writel(bp, NCR, MACB_BIT(RE) | MACB_BIT(TE) | MACB_BIT(MPE));
-
-	/* Enable interrupts */
-	macb_writel(bp, IER, (MACB_RX_INT_FLAGS
-			      | MACB_TX_INT_FLAGS
-			      | MACB_BIT(HRESP)));
-
 }
 
 /*
@@ -1736,7 +1799,7 @@ static int macb_open(struct net_device *dev)
 	/* schedule a link state check */
 	phy_start(bp->phy_dev);
 
-	netif_start_queue(dev);
+	netif_tx_start_all_queues(dev);
 
 	return 0;
 }
@@ -1746,7 +1809,7 @@ static int macb_close(struct net_device *dev)
 	struct macb *bp = netdev_priv(dev);
 	unsigned long flags;
 
-	netif_stop_queue(dev);
+	netif_tx_stop_all_queues(dev);
 	napi_disable(&bp->napi);
 
 	if (bp->phy_dev)
@@ -1895,8 +1958,8 @@ static void macb_get_regs(struct net_device *dev, struct ethtool_regs *regs,
 	regs->version = (macb_readl(bp, MID) & ((1 << MACB_REV_SIZE) - 1))
 			| MACB_GREGS_VERSION;
 
-	tail = macb_tx_ring_wrap(bp->tx_tail);
-	head = macb_tx_ring_wrap(bp->tx_head);
+	tail = macb_tx_ring_wrap(bp->queues[0].tx_tail);
+	head = macb_tx_ring_wrap(bp->queues[0].tx_head);
 
 	regs_buff[0]  = macb_readl(bp, NCR);
 	regs_buff[1]  = macb_or_gem_readl(bp, NCFGR);
@@ -1909,8 +1972,8 @@ static void macb_get_regs(struct net_device *dev, struct ethtool_regs *regs,
 
 	regs_buff[8]  = tail;
 	regs_buff[9]  = head;
-	regs_buff[10] = macb_tx_dma(bp, tail);
-	regs_buff[11] = macb_tx_dma(bp, head);
+	regs_buff[10] = macb_tx_dma(&bp->queues[0], tail);
+	regs_buff[11] = macb_tx_dma(&bp->queues[0], head);
 
 	if (macb_is_gem(bp)) {
 		regs_buff[12] = gem_readl(bp, USRIO);
@@ -2061,16 +2124,44 @@ static void macb_configure_caps(struct macb *bp)
 	netdev_dbg(bp->dev, "Cadence caps 0x%08x\n", bp->caps);
 }
 
+static void macb_probe_queues(void __iomem *mem,
+			      unsigned int *queue_mask,
+			      unsigned int *num_queues)
+{
+	unsigned int hw_q;
+	u32 mid;
+
+	*queue_mask = 0x1;
+	*num_queues = 1;
+
+	/* is it macb or gem? */
+	mid = __raw_readl(mem + MACB_MID);
+	if (MACB_BFEXT(IDNUM, mid) != 0x2)
+		return;
+
+	/* bit 0 is never set but queue 0 always exists */
+	*queue_mask = __raw_readl(mem + GEM_DCFG6) & 0xff;
+	*queue_mask |= 0x1;
+
+	for (hw_q = 1; hw_q < MACB_MAX_QUEUES; ++hw_q)
+		if (*queue_mask & (1 << hw_q))
+			(*num_queues)++;
+}
+
 static int __init macb_probe(struct platform_device *pdev)
 {
 	struct macb_platform_data *pdata;
 	struct resource *regs;
 	struct net_device *dev;
 	struct macb *bp;
+	struct macb_queue *queue;
 	struct phy_device *phydev;
 	u32 config;
 	int err = -ENXIO;
 	const char *mac;
+	void __iomem *mem;
+	unsigned int hw_q, queue_mask, q, num_queues, q_irq = 0;
+	struct clk *pclk, *hclk, *tx_clk;
 
 	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!regs) {
@@ -2078,72 +2169,112 @@ static int __init macb_probe(struct platform_device *pdev)
 		goto err_out;
 	}
 
-	err = -ENOMEM;
-	dev = alloc_etherdev(sizeof(*bp));
-	if (!dev)
-		goto err_out;
-
-	SET_NETDEV_DEV(dev, &pdev->dev);
-
-	bp = netdev_priv(dev);
-	bp->pdev = pdev;
-	bp->dev = dev;
-
-	spin_lock_init(&bp->lock);
-	INIT_WORK(&bp->tx_error_task, macb_tx_error_task);
-
-	bp->pclk = devm_clk_get(&pdev->dev, "pclk");
-	if (IS_ERR(bp->pclk)) {
-		err = PTR_ERR(bp->pclk);
+	pclk = devm_clk_get(&pdev->dev, "pclk");
+	if (IS_ERR(pclk)) {
+		err = PTR_ERR(pclk);
 		dev_err(&pdev->dev, "failed to get macb_clk (%u)\n", err);
-		goto err_out_free_dev;
+		goto err_out;
 	}
 
-	bp->hclk = devm_clk_get(&pdev->dev, "hclk");
-	if (IS_ERR(bp->hclk)) {
-		err = PTR_ERR(bp->hclk);
+	hclk = devm_clk_get(&pdev->dev, "hclk");
+	if (IS_ERR(hclk)) {
+		err = PTR_ERR(hclk);
 		dev_err(&pdev->dev, "failed to get hclk (%u)\n", err);
-		goto err_out_free_dev;
+		goto err_out;
 	}
 
-	bp->tx_clk = devm_clk_get(&pdev->dev, "tx_clk");
+	tx_clk = devm_clk_get(&pdev->dev, "tx_clk");
 
-	err = clk_prepare_enable(bp->pclk);
+	err = clk_prepare_enable(pclk);
 	if (err) {
 		dev_err(&pdev->dev, "failed to enable pclk (%u)\n", err);
-		goto err_out_free_dev;
+		goto err_out;
 	}
 
-	err = clk_prepare_enable(bp->hclk);
+	err = clk_prepare_enable(hclk);
 	if (err) {
 		dev_err(&pdev->dev, "failed to enable hclk (%u)\n", err);
 		goto err_out_disable_pclk;
 	}
 
-	if (!IS_ERR(bp->tx_clk)) {
-		err = clk_prepare_enable(bp->tx_clk);
+	if (!IS_ERR(tx_clk)) {
+		err = clk_prepare_enable(tx_clk);
 		if (err) {
 			dev_err(&pdev->dev, "failed to enable tx_clk (%u)\n",
-					err);
+				err);
 			goto err_out_disable_hclk;
 		}
 	}
 
-	bp->regs = devm_ioremap(&pdev->dev, regs->start, resource_size(regs));
-	if (!bp->regs) {
+	err = -ENOMEM;
+	mem = devm_ioremap(&pdev->dev, regs->start, resource_size(regs));
+	if (!mem) {
 		dev_err(&pdev->dev, "failed to map registers, aborting.\n");
-		err = -ENOMEM;
 		goto err_out_disable_clocks;
 	}
 
-	dev->irq = platform_get_irq(pdev, 0);
-	err = devm_request_irq(&pdev->dev, dev->irq, macb_interrupt, 0,
-			dev->name, dev);
-	if (err) {
-		dev_err(&pdev->dev, "Unable to request IRQ %d (error %d)\n",
-			dev->irq, err);
+	macb_probe_queues(mem, &queue_mask, &num_queues);
+	dev = alloc_etherdev_mq(sizeof(*bp), num_queues);
+	if (!dev)
 		goto err_out_disable_clocks;
+
+	SET_NETDEV_DEV(dev, &pdev->dev);
+
+	bp = netdev_priv(dev);
+	bp->pdev = pdev;
+	bp->dev = dev;
+	bp->regs = mem;
+	bp->num_queues = num_queues;
+	bp->pclk = pclk;
+	bp->hclk = hclk;
+	bp->tx_clk = tx_clk;
+
+	spin_lock_init(&bp->lock);
+
+	/* set the queue register mapping once and for all: queue0 has a
+	 * special register mapping but we don't want to test the queue index
+	 * and then compute the corresponding register offset at run time.
+	 */
+	for (hw_q = 0; hw_q < MACB_MAX_QUEUES; ++hw_q) {
+		if (!(queue_mask & (1 << hw_q)))
+			continue;
+
+		queue = &bp->queues[q_irq];
+		queue->bp = bp;
+		if (hw_q) {
+			queue->ISR  = GEM_ISR(hw_q - 1);
+			queue->IER  = GEM_IER(hw_q - 1);
+			queue->IDR  = GEM_IDR(hw_q - 1);
+			queue->IMR  = GEM_IMR(hw_q - 1);
+			queue->TBQP = GEM_TBQP(hw_q - 1);
+		} else {
+			/* queue0 uses legacy registers */
+			queue->ISR  = MACB_ISR;
+			queue->IER  = MACB_IER;
+			queue->IDR  = MACB_IDR;
+			queue->IMR  = MACB_IMR;
+			queue->TBQP = MACB_TBQP;
+		}
+
+		/* get irq: here we use the linux queue index, not the hardware
+		 * queue index. the queue irq definitions in the device tree
+		 * must be listed contiguously, skipping any gaps that may
+		 * exist in the hardware queue mask.
+		 */
+		queue->irq = platform_get_irq(pdev, q_irq);
+		err = devm_request_irq(&pdev->dev, queue->irq, macb_interrupt,
+				       0, dev->name, queue);
+		if (err) {
+			dev_err(&pdev->dev,
+				"Unable to request IRQ %d (error %d)\n",
+				queue->irq, err);
+			goto err_out_free_irq;
+		}
+
+		INIT_WORK(&queue->tx_error_task, macb_tx_error_task);
+		q_irq++;
 	}
+	dev->irq = bp->queues[0].irq;
 
 	dev->netdev_ops = &macb_netdev_ops;
 	netif_napi_add(dev, &bp->napi, macb_poll, 64);
@@ -2219,7 +2350,7 @@ static int __init macb_probe(struct platform_device *pdev)
 	err = register_netdev(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Cannot register net device, aborting.\n");
-		goto err_out_disable_clocks;
+		goto err_out_free_irq;
 	}
 
 	err = macb_mii_init(bp);
@@ -2242,15 +2373,17 @@ static int __init macb_probe(struct platform_device *pdev)
 
 err_out_unregister_netdev:
 	unregister_netdev(dev);
+err_out_free_irq:
+	for (q = 0, queue = bp->queues; q < q_irq; ++q, ++queue)
+		devm_free_irq(&pdev->dev, queue->irq, queue);
+	free_netdev(dev);
 err_out_disable_clocks:
-	if (!IS_ERR(bp->tx_clk))
-		clk_disable_unprepare(bp->tx_clk);
+	if (!IS_ERR(tx_clk))
+		clk_disable_unprepare(tx_clk);
 err_out_disable_hclk:
-	clk_disable_unprepare(bp->hclk);
+	clk_disable_unprepare(hclk);
 err_out_disable_pclk:
-	clk_disable_unprepare(bp->pclk);
-err_out_free_dev:
-	free_netdev(dev);
+	clk_disable_unprepare(pclk);
 err_out:
 	return err;
 }
@@ -2259,6 +2392,8 @@ static int __exit macb_remove(struct platform_device *pdev)
 {
 	struct net_device *dev;
 	struct macb *bp;
+	struct macb_queue *queue;
+	unsigned int q;
 
 	dev = platform_get_drvdata(pdev);
 
@@ -2270,11 +2405,14 @@ static int __exit macb_remove(struct platform_device *pdev)
 		kfree(bp->mii_bus->irq);
 		mdiobus_free(bp->mii_bus);
 		unregister_netdev(dev);
+		queue = bp->queues;
+		for (q = 0; q < bp->num_queues; ++q, ++queue)
+			devm_free_irq(&pdev->dev, queue->irq, queue);
+		free_netdev(dev);
 		if (!IS_ERR(bp->tx_clk))
 			clk_disable_unprepare(bp->tx_clk);
 		clk_disable_unprepare(bp->hclk);
 		clk_disable_unprepare(bp->pclk);
-		free_netdev(dev);
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index 517c09d72c4a5c2e27de75d95495bbc17f8602d7..084191b6fad2065331947c2b1d377ae943a545d1 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -12,6 +12,7 @@
 
 #define MACB_GREGS_NBR 16
 #define MACB_GREGS_VERSION 1
+#define MACB_MAX_QUEUES 8
 
 /* MACB register offsets */
 #define MACB_NCR				0x0000
@@ -89,6 +90,13 @@
 #define GEM_DCFG6				0x0294
 #define GEM_DCFG7				0x0298
 
+#define GEM_ISR(hw_q)				(0x0400 + ((hw_q) << 2))
+#define GEM_TBQP(hw_q)				(0x0440 + ((hw_q) << 2))
+#define GEM_RBQP(hw_q)				(0x0480 + ((hw_q) << 2))
+#define GEM_IER(hw_q)				(0x0600 + ((hw_q) << 2))
+#define GEM_IDR(hw_q)				(0x0620 + ((hw_q) << 2))
+#define GEM_IMR(hw_q)				(0x0640 + ((hw_q) << 2))
+
 /* Bitfields in NCR */
 #define MACB_LB_OFFSET				0
 #define MACB_LB_SIZE				1
@@ -376,6 +384,10 @@
 	__raw_readl((port)->regs + GEM_##reg)
 #define gem_writel(port, reg, value)			\
 	__raw_writel((value), (port)->regs + GEM_##reg)
+#define queue_readl(queue, reg)				\
+	__raw_readl((queue)->bp->regs + (queue)->reg)
+#define queue_writel(queue, reg, value)			\
+	__raw_writel((value), (queue)->bp->regs + (queue)->reg)
 
 /*
  * Conditional GEM/MACB macros.  These perform the operation to the correct
@@ -597,6 +609,23 @@ struct macb_config {
 	unsigned int		dma_burst_length;
 };
 
+struct macb_queue {
+	struct macb		*bp;
+	int			irq;
+
+	unsigned int		ISR;
+	unsigned int		IER;
+	unsigned int		IDR;
+	unsigned int		IMR;
+	unsigned int		TBQP;
+
+	unsigned int		tx_head, tx_tail;
+	struct macb_dma_desc	*tx_ring;
+	struct macb_tx_skb	*tx_skb;
+	dma_addr_t		tx_ring_dma;
+	struct work_struct	tx_error_task;
+};
+
 struct macb {
 	void __iomem		*regs;
 
@@ -607,9 +636,8 @@ struct macb {
 	void			*rx_buffers;
 	size_t			rx_buffer_size;
 
-	unsigned int		tx_head, tx_tail;
-	struct macb_dma_desc	*tx_ring;
-	struct macb_tx_skb	*tx_skb;
+	unsigned int		num_queues;
+	struct macb_queue	queues[MACB_MAX_QUEUES];
 
 	spinlock_t		lock;
 	struct platform_device	*pdev;
@@ -618,7 +646,6 @@ struct macb {
 	struct clk		*tx_clk;
 	struct net_device	*dev;
 	struct napi_struct	napi;
-	struct work_struct	tx_error_task;
 	struct net_device_stats	stats;
 	union {
 		struct macb_stats	macb;
@@ -626,7 +653,6 @@ struct macb {
 	}			hw_stats;
 
 	dma_addr_t		rx_ring_dma;
-	dma_addr_t		tx_ring_dma;
 	dma_addr_t		rx_buffers_dma;
 
 	struct macb_or_gem_ops	macbgem_ops;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index a18d33fdb27107e2252bc12177d21b4f0f4aeb99..5ab5c3133acd3136df281f22487ae7ea5a5d86bf 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -392,7 +392,7 @@ struct port_info {
 	s16    xact_addr_filt;        /* index of exact MAC address filter */
 	u16    rss_size;              /* size of VI's RSS table slice */
 	s8     mdio_addr;
-	u8     port_type;
+	enum fw_port_type port_type;
 	u8     mod_type;
 	u8     port_id;
 	u8     tx_chan;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 973dbb7938c3f7c8107a5ca2bbb577f6e452e0ff..ccf3436024bc8ce8ffdc814553469a523db73c92 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2325,7 +2325,7 @@ static int identify_port(struct net_device *dev,
 	return t4_identify_port(adap, adap->fn, netdev2pinfo(dev)->viid, val);
 }
 
-static unsigned int from_fw_linkcaps(unsigned int type, unsigned int caps)
+static unsigned int from_fw_linkcaps(enum fw_port_type type, unsigned int caps)
 {
 	unsigned int v = 0;
 
@@ -2354,14 +2354,20 @@ static unsigned int from_fw_linkcaps(unsigned int type, unsigned int caps)
 		     SUPPORTED_10000baseKR_Full | SUPPORTED_1000baseKX_Full |
 		     SUPPORTED_10000baseKX4_Full;
 	else if (type == FW_PORT_TYPE_FIBER_XFI ||
-		 type == FW_PORT_TYPE_FIBER_XAUI || type == FW_PORT_TYPE_SFP) {
+		 type == FW_PORT_TYPE_FIBER_XAUI ||
+		 type == FW_PORT_TYPE_SFP ||
+		 type == FW_PORT_TYPE_QSFP_10G ||
+		 type == FW_PORT_TYPE_QSA) {
 		v |= SUPPORTED_FIBRE;
 		if (caps & FW_PORT_CAP_SPEED_1G)
 			v |= SUPPORTED_1000baseT_Full;
 		if (caps & FW_PORT_CAP_SPEED_10G)
 			v |= SUPPORTED_10000baseT_Full;
-	} else if (type == FW_PORT_TYPE_BP40_BA)
+	} else if (type == FW_PORT_TYPE_BP40_BA ||
+		   type == FW_PORT_TYPE_QSFP) {
 		v |= SUPPORTED_40000baseSR4_Full;
+		v |= SUPPORTED_FIBRE;
+	}
 
 	if (caps & FW_PORT_CAP_ANEG)
 		v |= SUPPORTED_Autoneg;
@@ -2396,6 +2402,7 @@ static int get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 		cmd->port = PORT_FIBRE;
 	else if (p->port_type == FW_PORT_TYPE_SFP ||
 		 p->port_type == FW_PORT_TYPE_QSFP_10G ||
+		 p->port_type == FW_PORT_TYPE_QSA ||
 		 p->port_type == FW_PORT_TYPE_QSFP) {
 		if (p->mod_type == FW_PORT_MOD_TYPE_LR ||
 		    p->mod_type == FW_PORT_MOD_TYPE_SR ||
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index beaf80a6214b9175d36b0c32d15ed3eccb866dd2..291b6f219708c946c5ad01c9bc0485d06dc7f19c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -560,6 +560,7 @@ enum fw_flowc_mnem {
 	FW_FLOWC_MNEM_RCVNXT,
 	FW_FLOWC_MNEM_SNDBUF,
 	FW_FLOWC_MNEM_MSS,
+	FW_FLOWC_MNEM_TXDATAPLEN_MAX,
 };
 
 struct fw_flowc_mnemval {
@@ -2470,6 +2471,7 @@ enum fw_port_type {
 	FW_PORT_TYPE_BP4_AP,
 	FW_PORT_TYPE_QSFP_10G,
 	FW_PORT_TYPE_QSFP,
+	FW_PORT_TYPE_QSA,
 	FW_PORT_TYPE_BP40_BA,
 
 	FW_PORT_TYPE_NONE = FW_PORT_CMD_PTYPE_M
diff --git a/drivers/net/ethernet/davicom/Kconfig b/drivers/net/ethernet/davicom/Kconfig
index 316c5e5a92ad450372cd5eb8fa81b70a7df795a7..7ec2d74f94d3f3fdf70edef20779b3a2bc40bd7c 100644
--- a/drivers/net/ethernet/davicom/Kconfig
+++ b/drivers/net/ethernet/davicom/Kconfig
@@ -4,7 +4,7 @@
 
 config DM9000
 	tristate "DM9000 support"
-	depends on ARM || BLACKFIN || MIPS || COLDFIRE
+	depends on ARM || BLACKFIN || MIPS || COLDFIRE || NIOS2
 	select CRC32
 	select MII
 	---help---
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index ee1ecb146df7de40e126cbffca7c80528254ef8e..eb088b129bc717ae786880b4d08d5d3260ddc1c7 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -615,14 +615,14 @@ static bool fm10k_clean_rx_irq(struct fm10k_q_vector *q_vector,
 
 		rx_desc = FM10K_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
-		if (!fm10k_test_staterr(rx_desc, FM10K_RXD_STATUS_DD))
+		if (!rx_desc->d.staterr)
 			break;
 
 		/* This memory barrier is needed to keep us from reading
 		 * any other fields out of the rx_desc until we know the
-		 * RXD_STATUS_DD bit is set
+		 * descriptor has been written back
 		 */
-		rmb();
+		dma_rmb();
 
 		/* retrieve a buffer from the ring */
 		skb = fm10k_fetch_rx_buffer(rx_ring, rx_desc, skb);
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 2e526d4904a6912638030297a9f60abd792d5f53..ff59897a946363442d657e84784962c6a600a45b 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6910,14 +6910,14 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 
 		rx_desc = IGB_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
-		if (!igb_test_staterr(rx_desc, E1000_RXD_STAT_DD))
+		if (!rx_desc->wb.upper.status_error)
 			break;
 
 		/* This memory barrier is needed to keep us from reading
 		 * any other fields out of the rx_desc until we know the
-		 * RXD_STAT_DD bit is set
+		 * descriptor has been written back
 		 */
-		rmb();
+		dma_rmb();
 
 		/* retrieve a buffer from the ring */
 		skb = igb_fetch_rx_buffer(rx_ring, rx_desc, skb);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 798b05556e1bcced693bc680ac076992e1a59e85..2ed2c7de230444f88c3f06451d7cc8a7167f5f05 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2009,15 +2009,14 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 
 		rx_desc = IXGBE_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
-		if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_DD))
+		if (!rx_desc->wb.upper.status_error)
 			break;
 
-		/*
-		 * This memory barrier is needed to keep us from reading
+		/* This memory barrier is needed to keep us from reading
 		 * any other fields out of the rx_desc until we know the
-		 * RXD_STAT_DD bit is set
+		 * descriptor has been written back
 		 */
-		rmb();
+		dma_rmb();
 
 		/* retrieve a buffer from the ring */
 		skb = ixgbe_fetch_rx_buffer(rx_ring, rx_desc);
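All three Intel hunks (fm10k, igb, ixgbe) make the same pair of changes.
The break test no longer isolates the DD bit: the driver zeroes the
write-back status word when it refills a descriptor, and the device sets
DD together with the other status bits on write-back, so any nonzero
value already means "written back" and the byte-swapped mask test can be
skipped.  And because the descriptor rings live in coherent DMA memory,
the lighter dma_rmb() is enough to keep the subsequent field reads from
being hoisted above the status check.  For reference, a sketch of the
helper being bypassed (modelled on igb's; the other drivers differ only
in detail):

	static inline __le32 igb_test_staterr(union e1000_adv_rx_desc *rx_desc,
					      const u32 stat_err_bits)
	{
		return rx_desc->wb.upper.status_error & cpu_to_le32(stat_err_bits);
	}
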
diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
index 4a1be34d7214c9971e5c254f61bbfa3ff647bfe9..44ce7d88f554638035090cf165f0ffdbf1b2902f 100644
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -1364,8 +1364,8 @@ jme_link_change_tasklet(unsigned long arg)
 	jme_free_rx_resources(jme);
 out_enable_tasklet:
 	tasklet_enable(&jme->txclean_task);
-	tasklet_hi_enable(&jme->rxclean_task);
-	tasklet_hi_enable(&jme->rxempty_task);
+	tasklet_enable(&jme->rxclean_task);
+	tasklet_enable(&jme->rxempty_task);
 out:
 	atomic_inc(&jme->link_changing);
 }
@@ -2408,8 +2408,8 @@ static inline void jme_resume_rx(struct jme_adapter *jme)
 	if (test_bit(JME_FLAG_POLL, &jme->flags)) {
 		JME_NAPI_ENABLE(jme);
 	} else {
-		tasklet_hi_enable(&jme->rxclean_task);
-		tasklet_hi_enable(&jme->rxempty_task);
+		tasklet_enable(&jme->rxclean_task);
+		tasklet_enable(&jme->rxempty_task);
 	}
 	dpi->cur		= PCC_P1;
 	dpi->attempt		= PCC_P1;
@@ -3290,8 +3290,8 @@ jme_suspend(struct device *dev)
 	}
 
 	tasklet_enable(&jme->txclean_task);
-	tasklet_hi_enable(&jme->rxclean_task);
-	tasklet_hi_enable(&jme->rxempty_task);
+	tasklet_enable(&jme->rxclean_task);
+	tasklet_enable(&jme->rxempty_task);
 
 	jme_powersave_phy(jme);
 
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 3dad7e88495210710a800951c8c7849ccbfa3c5f..14a1c5cec3a59fc8699858a507ca3b077b45981b 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5919,7 +5919,7 @@ static void rtl_hw_start_8411(struct rtl8169_private *tp)
 	rtl_w0w1_eri(tp, 0x0d4, ERIAR_MASK_0011, 0x0c00, 0x0000, ERIAR_EXGMAC);
 }
 
-static void rtl_hw_start_8168g_1(struct rtl8169_private *tp)
+static void rtl_hw_start_8168g(struct rtl8169_private *tp)
 {
 	void __iomem *ioaddr = tp->mmio_addr;
 	struct pci_dev *pdev = tp->pci_dev;
@@ -5954,6 +5954,24 @@ static void rtl_hw_start_8168g_1(struct rtl8169_private *tp)
 	rtl_pcie_state_l2l3_enable(tp, false);
 }
 
+static void rtl_hw_start_8168g_1(struct rtl8169_private *tp)
+{
+	void __iomem *ioaddr = tp->mmio_addr;
+	static const struct ephy_info e_info_8168g_1[] = {
+		{ 0x00, 0x0000,	0x0008 },
+		{ 0x0c, 0x37d0,	0x0820 },
+		{ 0x1e, 0x0000,	0x0001 },
+		{ 0x19, 0x8000,	0x0000 }
+	};
+
+	rtl_hw_start_8168g(tp);
+
+	/* disable aspm and clock request before accessing ephy */
+	RTL_W8(Config2, RTL_R8(Config2) & ~ClkReqEn);
+	RTL_W8(Config5, RTL_R8(Config5) & ~ASPM_en);
+	rtl_ephy_init(tp, e_info_8168g_1, ARRAY_SIZE(e_info_8168g_1));
+}
+
 static void rtl_hw_start_8168g_2(struct rtl8169_private *tp)
 {
 	void __iomem *ioaddr = tp->mmio_addr;
@@ -5964,7 +5982,7 @@ static void rtl_hw_start_8168g_2(struct rtl8169_private *tp)
 		{ 0x1e, 0xffff,	0x20eb }
 	};
 
-	rtl_hw_start_8168g_1(tp);
+	rtl_hw_start_8168g(tp);
 
 	/* disable aspm and clock request before accessing ephy */
 	RTL_W8(Config2, RTL_R8(Config2) & ~ClkReqEn);
@@ -5983,7 +6001,7 @@ static void rtl_hw_start_8411_2(struct rtl8169_private *tp)
 		{ 0x1e, 0x0000,	0x2000 }
 	};
 
-	rtl_hw_start_8168g_1(tp);
+	rtl_hw_start_8168g(tp);
 
 	/* disable aspm and clock request before accessing ephy */
 	RTL_W8(Config2, RTL_R8(Config2) & ~ClkReqEn);
@@ -6605,6 +6623,9 @@ static inline void rtl8169_mark_to_asic(struct RxDesc *desc, u32 rx_buf_sz)
 {
 	u32 eor = le32_to_cpu(desc->opts1) & RingEnd;
 
+	/* Force memory writes to complete before releasing descriptor */
+	dma_wmb();
+
 	desc->opts1 = cpu_to_le32(DescOwn | eor | rx_buf_sz);
 }
 
@@ -6612,7 +6633,6 @@ static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping,
 				       u32 rx_buf_sz)
 {
 	desc->addr = cpu_to_le64(mapping);
-	wmb();
 	rtl8169_mark_to_asic(desc, rx_buf_sz);
 }
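With the barrier folded into rtl8169_mark_to_asic(), every path that
hands an Rx descriptor back to the chip picks up the ordering
automatically, which is what lets the standalone wmb() here and the one
in rtl_rx()'s release_descriptor path (removed further down) go away.
The consolidated handover, sketched:

	/* all descriptor fields must be visible before ownership flips */
	desc->addr = cpu_to_le64(mapping);
	desc->opts2 = 0;
	dma_wmb();
	desc->opts1 = cpu_to_le32(DescOwn | eor | rx_buf_sz);
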
 
@@ -7073,16 +7093,18 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 
 	skb_tx_timestamp(skb);
 
-	wmb();
+	/* Force memory writes to complete before releasing descriptor */
+	dma_wmb();
 
 	/* Anti gcc 2.95.3 bugware (sic) */
 	status = opts[0] | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
 	txd->opts1 = cpu_to_le32(status);
 
-	tp->cur_tx += frags + 1;
-
+	/* Force all memory writes to complete before notifying device */
 	wmb();
 
+	tp->cur_tx += frags + 1;
+
 	RTL_W8(TxPoll, NPQ);
 
 	mmiowb();
@@ -7181,11 +7203,16 @@ static void rtl_tx(struct net_device *dev, struct rtl8169_private *tp)
 		struct ring_info *tx_skb = tp->tx_skb + entry;
 		u32 status;
 
-		rmb();
 		status = le32_to_cpu(tp->TxDescArray[entry].opts1);
 		if (status & DescOwn)
 			break;
 
+		/* This barrier is needed to keep us from reading
+		 * any other fields out of the Tx descriptor until
+		 * we know the status of DescOwn
+		 */
+		dma_rmb();
+
 		rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb,
 				     tp->TxDescArray + entry);
 		if (status & LastFrag) {
@@ -7280,11 +7307,16 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget
 		struct RxDesc *desc = tp->RxDescArray + entry;
 		u32 status;
 
-		rmb();
 		status = le32_to_cpu(desc->opts1) & tp->opts1_mask;
-
 		if (status & DescOwn)
 			break;
+
+		/* This barrier is needed to keep us from reading
+		 * any other fields out of the Rx descriptor until
+		 * we know the status of DescOwn
+		 */
+		dma_rmb();
+
 		if (unlikely(status & RxRES)) {
 			netif_info(tp, rx_err, dev, "Rx ERROR. status = %08x\n",
 				   status);
@@ -7346,7 +7378,6 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget
 		}
 release_descriptor:
 		desc->opts2 = 0;
-		wmb();
 		rtl8169_mark_to_asic(desc, rx_buf_sz);
 	}
 
diff --git a/drivers/net/ethernet/smsc/Kconfig b/drivers/net/ethernet/smsc/Kconfig
index 753630f5d3d35d15ad0d596ccff5c082bcc7bbbe..627926800ff38fe16467c099cb58517cf37456a1 100644
--- a/drivers/net/ethernet/smsc/Kconfig
+++ b/drivers/net/ethernet/smsc/Kconfig
@@ -6,7 +6,7 @@ config NET_VENDOR_SMSC
 	bool "SMC (SMSC)/Western Digital devices"
 	default y
 	depends on ARM || ISA || MAC || ARM64 || MIPS || M32R || SUPERH || \
-		BLACKFIN || MN10300 || COLDFIRE || XTENSA || PCI || PCMCIA
+		BLACKFIN || MN10300 || COLDFIRE || XTENSA || NIOS2 || PCI || PCMCIA
 	---help---
 	  If you have a network (Ethernet) card belonging to this class, say Y
 	  and read the Ethernet-HOWTO, available from
@@ -39,7 +39,7 @@ config SMC91X
 	select CRC32
 	select MII
 	depends on (ARM || M32R || SUPERH || MIPS || BLACKFIN || \
-		    MN10300 || COLDFIRE || ARM64 || XTENSA)
+		    MN10300 || COLDFIRE || ARM64 || XTENSA || NIOS2)
 	---help---
 	  This is a driver for SMC's 91x series of Ethernet chipsets,
 	  including the SMC91C94 and the SMC91C111. Say Y if you want it
diff --git a/drivers/net/phy/fixed.c b/drivers/net/phy/fixed.c
index 47872caa0081dbc1974c4aa199f035d83986757d..3ad0e6e16c395e61c0f31bd9ab6b30b3fb6adf4e 100644
--- a/drivers/net/phy/fixed.c
+++ b/drivers/net/phy/fixed.c
@@ -274,6 +274,7 @@ struct phy_device *fixed_phy_register(unsigned int irq,
 
 	return phy;
 }
+EXPORT_SYMBOL_GPL(fixed_phy_register);
 
 static int __init fixed_mdio_bus_init(void)
 {
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
index 69fbfc89efb697ba139228b71e8e9bb111ea185d..a83d2ceded83e033e322343009cc3a054dfb2fdf 100644
--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -75,6 +75,7 @@ typedef void (*cxgb4i_cplhandler_func)(struct cxgbi_device *, struct sk_buff *);
 static void *t4_uld_add(const struct cxgb4_lld_info *);
 static int t4_uld_rx_handler(void *, const __be64 *, const struct pkt_gl *);
 static int t4_uld_state_change(void *, enum cxgb4_state state);
+static inline int send_tx_flowc_wr(struct cxgbi_sock *);
 
 static const struct cxgb4_uld_info cxgb4i_uld_info = {
 	.name = DRV_MODULE_NAME,
@@ -157,12 +158,6 @@ static struct scsi_transport_template *cxgb4i_stt;
 #define RCV_BUFSIZ_MASK		0x3FFU
 #define MAX_IMM_TX_PKT_LEN	128
 
-static inline void set_queue(struct sk_buff *skb, unsigned int queue,
-				const struct cxgbi_sock *csk)
-{
-	skb->queue_mapping = queue;
-}
-
 static int push_tx_frames(struct cxgbi_sock *, int);
 
 /*
@@ -172,10 +167,14 @@ static int push_tx_frames(struct cxgbi_sock *, int);
  * Returns true if a packet can be sent as an offload WR with immediate
  * data.  We currently use the same limit as for Ethernet packets.
  */
-static inline int is_ofld_imm(const struct sk_buff *skb)
+static inline bool is_ofld_imm(const struct sk_buff *skb)
 {
-	return skb->len <= (MAX_IMM_TX_PKT_LEN -
-			sizeof(struct fw_ofld_tx_data_wr));
+	int len = skb->len;
+
+	if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR)))
+		len += sizeof(struct fw_ofld_tx_data_wr);
+
+	return len <= MAX_IMM_TX_PKT_LEN;
 }
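+
+/* Worked example for the new test (a sketch; assumes the 16-byte
+ * struct fw_ofld_tx_data_wr of this t4fw_api.h): a payload skb of
+ * len 112 that still needs its WR header charges 112 + 16 = 128 <= 128
+ * and goes out as immediate data; a control skb of len 120 that
+ * already carries its WR charges only 120.  The old test
+ * (skb->len <= 128 - 16) wrongly rejected that second case.
+ */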
 
 static void send_act_open_req(struct cxgbi_sock *csk, struct sk_buff *skb,
@@ -388,13 +387,19 @@ static void send_abort_req(struct cxgbi_sock *csk)
 
 	if (unlikely(csk->state == CTP_ABORTING) || !skb || !csk->cdev)
 		return;
+
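+	/* The firmware expects a FlowC WR to be the first WR posted on
+	 * a connection; if no tx data has ever gone out, send one now
+	 * so the abort request that follows is accepted.
+	 */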
+	if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) {
+		send_tx_flowc_wr(csk);
+		cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT);
+	}
+
 	cxgbi_sock_set_state(csk, CTP_ABORTING);
 	cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_PENDING);
 	cxgbi_sock_purge_write_queue(csk);
 
 	csk->cpl_abort_req = NULL;
 	req = (struct cpl_abort_req *)skb->head;
-	set_queue(skb, CPL_PRIORITY_DATA, csk);
+	set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id);
 	req->cmd = CPL_ABORT_SEND_RST;
 	t4_set_arp_err_handler(skb, csk, abort_arp_failure);
 	INIT_TP_WR(req, csk->tid);
@@ -420,7 +425,7 @@ static void send_abort_rpl(struct cxgbi_sock *csk, int rst_status)
 		csk, csk->state, csk->flags, csk->tid, rst_status);
 
 	csk->cpl_abort_rpl = NULL;
-	set_queue(skb, CPL_PRIORITY_DATA, csk);
+	set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id);
 	INIT_TP_WR(rpl, csk->tid);
 	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, csk->tid));
 	rpl->cmd = rst_status;
@@ -491,20 +496,40 @@ static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb)
 	return flits + sgl_len(cnt);
 }
 
-static inline void send_tx_flowc_wr(struct cxgbi_sock *csk)
+#define FLOWC_WR_NPARAMS_MIN	9
+static inline int tx_flowc_wr_credits(int *nparamsp, int *flowclenp)
+{
+	int nparams, flowclen16, flowclen;
+
+	nparams = FLOWC_WR_NPARAMS_MIN;
+	flowclen = offsetof(struct fw_flowc_wr, mnemval[nparams]);
+	flowclen16 = DIV_ROUND_UP(flowclen, 16);
+	flowclen = flowclen16 * 16;
+	/*
+	 * Return the number of 16-byte credits used by the FlowC request.
+	 * Pass back the nparams and actual FlowC length if requested.
+	 */
+	if (nparamsp)
+		*nparamsp = nparams;
+	if (flowclenp)
+		*flowclenp = flowclen;
+
+	return flowclen16;
+}
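+
+/* Worked example: with nparams = FLOWC_WR_NPARAMS_MIN = 9, and assuming
+ * this t4fw_api.h's layout of an 8-byte fw_flowc_wr header plus 8 bytes
+ * per mnemval, flowclen = 8 + 9 * 8 = 80 bytes and flowclen16 =
+ * DIV_ROUND_UP(80, 16) = 5 credits -- matching the hard-coded flowclen
+ * of 80 and the "+= 5" credit fixup that this series deletes from
+ * push_tx_frames().
+ */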
+
+static inline int send_tx_flowc_wr(struct cxgbi_sock *csk)
 {
 	struct sk_buff *skb;
 	struct fw_flowc_wr *flowc;
-	int flowclen, i;
+	int nparams, flowclen16, flowclen;
 
-	flowclen = 80;
+	flowclen16 = tx_flowc_wr_credits(&nparams, &flowclen);
 	skb = alloc_wr(flowclen, 0, GFP_ATOMIC);
 	flowc = (struct fw_flowc_wr *)skb->head;
 	flowc->op_to_nparams =
-		htonl(FW_WR_OP_V(FW_FLOWC_WR) | FW_FLOWC_WR_NPARAMS_V(8));
+		htonl(FW_WR_OP_V(FW_FLOWC_WR) | FW_FLOWC_WR_NPARAMS_V(nparams));
 	flowc->flowid_len16 =
-		htonl(FW_WR_LEN16_V(DIV_ROUND_UP(72, 16)) |
-				FW_WR_FLOWID_V(csk->tid));
+		htonl(FW_WR_LEN16_V(flowclen16) | FW_WR_FLOWID_V(csk->tid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
 	flowc->mnemval[0].val = htonl(csk->cdev->pfvf);
 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
@@ -523,12 +548,10 @@ static inline void send_tx_flowc_wr(struct cxgbi_sock *csk)
 	flowc->mnemval[7].val = htonl(csk->advmss);
 	flowc->mnemval[8].mnemonic = 0;
 	flowc->mnemval[8].val = 0;
-	for (i = 0; i < 9; i++) {
-		flowc->mnemval[i].r4[0] = 0;
-		flowc->mnemval[i].r4[1] = 0;
-		flowc->mnemval[i].r4[2] = 0;
-	}
-	set_queue(skb, CPL_PRIORITY_DATA, csk);
+	flowc->mnemval[8].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
+	flowc->mnemval[8].val = htonl(16384);	/* max tx payload length, in bytes */
+
+	set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id);
 
 	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
 		"csk 0x%p, tid 0x%x, %u,%u,%u,%u,%u,%u,%u.\n",
@@ -537,6 +560,8 @@ static inline void send_tx_flowc_wr(struct cxgbi_sock *csk)
 		csk->advmss);
 
 	cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb);
+
+	return flowclen16;
 }
 
 static inline void make_tx_data_wr(struct cxgbi_sock *csk, struct sk_buff *skb,
@@ -545,10 +570,11 @@ static inline void make_tx_data_wr(struct cxgbi_sock *csk, struct sk_buff *skb,
 	struct fw_ofld_tx_data_wr *req;
 	unsigned int submode = cxgbi_skcb_ulp_mode(skb) & 3;
 	unsigned int wr_ulp_mode = 0, val;
+	bool imm = is_ofld_imm(skb);
 
 	req = (struct fw_ofld_tx_data_wr *)__skb_push(skb, sizeof(*req));
 
-	if (is_ofld_imm(skb)) {
+	if (imm) {
 		req->op_to_immdlen = htonl(FW_WR_OP_V(FW_OFLD_TX_DATA_WR) |
 					FW_WR_COMPL_F |
 					FW_WR_IMMDLEN_V(dlen));
@@ -597,16 +623,32 @@ static int push_tx_frames(struct cxgbi_sock *csk, int req_completion)
 		int dlen = skb->len;
 		int len = skb->len;
 		unsigned int credits_needed;
+		int flowclen16 = 0;
 
 		skb_reset_transport_header(skb);
 		if (is_ofld_imm(skb))
-			credits_needed = DIV_ROUND_UP(dlen +
-					sizeof(struct fw_ofld_tx_data_wr), 16);
+			credits_needed = DIV_ROUND_UP(dlen, 16);
 		else
-			credits_needed = DIV_ROUND_UP(8*calc_tx_flits_ofld(skb)
-					+ sizeof(struct fw_ofld_tx_data_wr),
+			credits_needed = DIV_ROUND_UP(
+						8 * calc_tx_flits_ofld(skb),
+						16);
+
+		if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR)))
+			credits_needed += DIV_ROUND_UP(
+					sizeof(struct fw_ofld_tx_data_wr),
 					16);
 
+		/*
+		 * Assumes the initial credit allocation is large enough
+		 * to cover the fw_flowc_wr plus the largest possible
+		 * first payload.
+		 */
+		if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) {
+			flowclen16 = send_tx_flowc_wr(csk);
+			csk->wr_cred -= flowclen16;
+			csk->wr_una_cred += flowclen16;
+			cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT);
+		}
+
 		if (csk->wr_cred < credits_needed) {
 			log_debug(1 << CXGBI_DBG_PDU_TX,
 				"csk 0x%p, skb %u/%u, wr %d < %u.\n",
@@ -615,8 +657,8 @@ static int push_tx_frames(struct cxgbi_sock *csk, int req_completion)
 			break;
 		}
 		__skb_unlink(skb, &csk->write_queue);
-		set_queue(skb, CPL_PRIORITY_DATA, csk);
-		skb->csum = credits_needed;
+		set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id);
+		skb->csum = credits_needed + flowclen16;
 		csk->wr_cred -= credits_needed;
 		csk->wr_una_cred += credits_needed;
 		cxgbi_sock_enqueue_wr(csk, skb);
@@ -627,12 +669,6 @@ static int push_tx_frames(struct cxgbi_sock *csk, int req_completion)
 			csk->wr_cred, csk->wr_una_cred);
 
 		if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR))) {
-			if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) {
-				send_tx_flowc_wr(csk);
-				skb->csum += 5;
-				csk->wr_cred -= 5;
-				csk->wr_una_cred += 5;
-			}
 			len += cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb));
 			make_tx_data_wr(csk, skb, dlen, len, credits_needed,
 					req_completion);
@@ -807,6 +843,13 @@ static void csk_act_open_retry_timer(unsigned long data)
 
 }
 
+static inline bool is_neg_adv(unsigned int status)
+{
+	return status == CPL_ERR_RTX_NEG_ADVICE ||
+		status == CPL_ERR_KEEPALV_NEG_ADVICE ||
+		status == CPL_ERR_PERSIST_NEG_ADVICE;
+}
+
 static void do_act_open_rpl(struct cxgbi_device *cdev, struct sk_buff *skb)
 {
 	struct cxgbi_sock *csk;
@@ -828,7 +871,7 @@ static void do_act_open_rpl(struct cxgbi_device *cdev, struct sk_buff *skb)
 		       "csk 0x%p,%u,0x%lx. ", (&csk->saddr), (&csk->daddr),
 		       atid, tid, status, csk, csk->state, csk->flags);
 
-	if (status == CPL_ERR_RTX_NEG_ADVICE)
+	if (is_neg_adv(status))
 		goto rel_skb;
 
 	module_put(THIS_MODULE);
@@ -934,8 +977,7 @@ static void do_abort_req_rss(struct cxgbi_device *cdev, struct sk_buff *skb)
 		       (&csk->saddr), (&csk->daddr),
 		       csk, csk->state, csk->flags, csk->tid, req->status);
 
-	if (req->status == CPL_ERR_RTX_NEG_ADVICE ||
-	    req->status == CPL_ERR_PERSIST_NEG_ADVICE)
+	if (is_neg_adv(req->status))
 		goto rel_skb;
 
 	cxgbi_sock_get(csk);
@@ -989,6 +1031,27 @@ static void do_abort_rpl_rss(struct cxgbi_device *cdev, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
+static void do_rx_data(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct cpl_rx_data *cpl = (struct cpl_rx_data *)skb->data;
+	unsigned int tid = GET_TID(cpl);
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+
+	csk = lookup_tid(t, tid);
+	if (!csk) {
+		pr_err("can't find connection for tid %u.\n", tid);
+	} else {
+		/* not expecting this, reset the connection. */
+		pr_err("csk 0x%p, tid %u, rcv cpl_rx_data.\n", csk, tid);
+		spin_lock_bh(&csk->lock);
+		send_abort_req(csk);
+		spin_unlock_bh(&csk->lock);
+	}
+	__kfree_skb(skb);
+}
+
 static void do_rx_iscsi_hdr(struct cxgbi_device *cdev, struct sk_buff *skb)
 {
 	struct cxgbi_sock *csk;
@@ -1408,6 +1471,7 @@ cxgb4i_cplhandler_func cxgb4i_cplhandlers[NUM_CPL_CMDS] = {
 	[CPL_SET_TCB_RPL] = do_set_tcb_rpl,
 	[CPL_RX_DATA_DDP] = do_rx_data_ddp,
 	[CPL_RX_ISCSI_DDP] = do_rx_data_ddp,
+	[CPL_RX_DATA] = do_rx_data,
 };
 
 int cxgb4i_ofld_init(struct cxgbi_device *cdev)
@@ -1485,7 +1549,7 @@ static int ddp_ppod_write_idata(struct cxgbi_device *cdev, unsigned int port_id,
 		return -ENOMEM;
 	}
 	req = (struct ulp_mem_io *)skb->head;
-	set_queue(skb, CPL_PRIORITY_CONTROL, NULL);
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
 
 	ulp_mem_io_set_hdr(lldi, req, wr_len, dlen, pm_addr);
 	idata = (struct ulptx_idata *)(req + 1);
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 7da59c38a69ec97cf15507876122151222e52fe1..eb58afcfb73b4c7bbff39a33da272ab9d52e1c39 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -2294,10 +2294,12 @@ int cxgbi_conn_xmit_pdu(struct iscsi_task *task)
 		return err;
 	}
 
-	kfree_skb(skb);
 	log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX,
 		"itt 0x%x, skb 0x%p, len %u/%u, xmit err %d.\n",
 		task->itt, skb, skb->len, skb->data_len, err);
+
+	kfree_skb(skb);
+
 	iscsi_conn_printk(KERN_ERR, task->conn, "xmit err %d.\n", err);
 	iscsi_conn_failure(task->conn, ISCSI_ERR_XMIT_FAILED);
 	return err;
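This reorder fixes a use-after-free: the log_debug() dereferences
skb->len and skb->data_len, so freeing the skb first meant the trace
read freed memory whenever PDU_TX logging was enabled.  In miniature:

	kfree_skb(skb);
	pr_debug("len %u\n", skb->len);		/* use-after-free */

becomes

	pr_debug("len %u\n", skb->len);		/* last touch of skb */
	kfree_skb(skb);
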
diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h
index 2c7cb1c0c453d307418afc2eb4d73bb103c6c1f0..aba1af720df653548a6c1b03934780d804c05500 100644
--- a/drivers/scsi/cxgbi/libcxgbi.h
+++ b/drivers/scsi/cxgbi/libcxgbi.h
@@ -317,8 +317,8 @@ static inline void cxgbi_skcb_clear_flag(struct sk_buff *skb,
 	__clear_bit(flag, &(cxgbi_skcb_flags(skb)));
 }
 
-static inline int cxgbi_skcb_test_flag(struct sk_buff *skb,
-					enum cxgbi_skcb_flags flag)
+static inline int cxgbi_skcb_test_flag(const struct sk_buff *skb,
+				       enum cxgbi_skcb_flags flag)
 {
 	return test_bit(flag, &(cxgbi_skcb_flags(skb)));
 }
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 1402fa855388985282ff05e157b3388e85114d92..f5c40b0fadc2a50be563304727db2e7ad7fe6699 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -42,6 +42,14 @@
 #define wmb()	mb()
 #endif
 
+#ifndef dma_rmb
+#define dma_rmb()	rmb()
+#endif
+
+#ifndef dma_wmb
+#define dma_wmb()	wmb()
+#endif
+
 #ifndef read_barrier_depends
 #define read_barrier_depends()		do { } while (0)
 #endif
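The #ifndef guards make the full rmb()/wmb() the safe default while
letting an architecture define something weaker for the coherent-DMA
case before it includes asm-generic/barrier.h.  A hypothetical override,
modelled on ARM-style barriers (the exact dmb options are illustrative
assumptions, not a quote of any particular header):

	/* arch/<arch>/include/asm/barrier.h */
	#define dma_rmb()	dmb(oshld)	/* reads, outer-shareable (DMA) domain */
	#define dma_wmb()	dmb(oshst)	/* writes, outer-shareable (DMA) domain */

	#include <asm-generic/barrier.h>	/* the fallbacks above are skipped */
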
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 69517a24bc50678e4f2d69d931b9586cb76a9b69..d9b05b5bf8c7954f63ace548e0a6348f5fa0e327 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -556,12 +556,6 @@ static inline void tasklet_enable(struct tasklet_struct *t)
 	atomic_dec(&t->count);
 }
 
-static inline void tasklet_hi_enable(struct tasklet_struct *t)
-{
-	smp_mb__before_atomic();
-	atomic_dec(&t->count);
-}
-
 extern void tasklet_kill(struct tasklet_struct *t);
 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
 extern void tasklet_init(struct tasklet_struct *t,
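tasklet_hi_enable() can simply be deleted because it was byte-for-byte
identical to the tasklet_enable() kept above -- both issue
smp_mb__before_atomic() and atomic_dec(&t->count) -- and the jme
conversion earlier in this series removed its last caller.  "Hi" was
never a property of enabling; it is chosen when the tasklet is
scheduled:

	/* usage sketch */
	tasklet_enable(&jme->rxempty_task);		/* was tasklet_hi_enable() */
	tasklet_hi_schedule(&jme->rxempty_task);	/* still runs from HI_SOFTIRQ */
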
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 376805005cc7f8a6d18dab16229a2560d877ed19..118956448cf6e4aa9a3609ccf7455a04596e24bf 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -579,11 +579,12 @@ static int vlan_dev_init(struct net_device *dev)
 		      (1<<__LINK_STATE_PRESENT);
 
 	dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG |
-			   NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
+			   NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
 			   NETIF_F_HIGHDMA | NETIF_F_SCTP_CSUM |
 			   NETIF_F_ALL_FCOE;
 
-	dev->features |= real_dev->vlan_features | NETIF_F_LLTX;
+	dev->features |= real_dev->vlan_features | NETIF_F_LLTX |
+			 NETIF_F_GSO_SOFTWARE;
 	dev->gso_max_size = real_dev->gso_max_size;
 	if (dev->features & NETIF_F_VLAN_FEATURES)
 		netdev_warn(real_dev, "VLAN features are set incorrectly.  Q-in-Q configurations may not work correctly.\n");
@@ -648,7 +649,7 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
 	features |= NETIF_F_RXCSUM;
 	features = netdev_intersect_features(features, real_dev->features);
 
-	features |= old_features & NETIF_F_SOFT_FEATURES;
+	features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE);
 	features |= NETIF_F_LLTX;
 
 	return features;
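Both vlan hunks trade NETIF_F_ALL_TSO for NETIF_F_GSO_SOFTWARE, and
vlan_dev_fix_features() now preserves those bits even when the real
device lacks them.  Software segmentation never depends on the lower
device: if it cannot segment, the stack falls back to skb_gso_segment()
at transmit time, so the vlan device may always advertise the
software-GSO set.  At the time of this series the two masks differ only
by UFO (an assumption worth re-checking against netdev_features.h):

	NETIF_F_ALL_TSO      == NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN
	NETIF_F_GSO_SOFTWARE == NETIF_F_ALL_TSO | NETIF_F_UFO
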
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 528380a3e296001b9c727e9f35e07f403d7f050b..515569ffde8a16af9eea82cc85ded4f8d5ce59cc 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -512,7 +512,7 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
 }
 
 /* slave device setup *******************************************************/
-static void dsa_slave_phy_setup(struct dsa_slave_priv *p,
+static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 				struct net_device *slave_dev)
 {
 	struct dsa_switch *ds = p->parent;
@@ -533,7 +533,7 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p,
 		ret = of_phy_register_fixed_link(port_dn);
 		if (ret) {
 			netdev_err(slave_dev, "failed to register fixed PHY\n");
-			return;
+			return ret;
 		}
 		phy_is_fixed = true;
 		phy_dn = port_dn;
@@ -555,12 +555,17 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p,
 	 */
 	if (!p->phy) {
 		p->phy = ds->slave_mii_bus->phy_map[p->port];
+		if (!p->phy)
+			return -ENODEV;
+
 		phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
 				   p->phy_interface);
 	} else {
 		netdev_info(slave_dev, "attached PHY at address %d [%s]\n",
 			    p->phy->addr, p->phy->drv->name);
 	}
+
+	return 0;
 }
 
 int dsa_slave_suspend(struct net_device *slave_dev)
@@ -653,12 +658,17 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	p->old_link = -1;
 	p->old_duplex = -1;
 
-	dsa_slave_phy_setup(p, slave_dev);
+	ret = dsa_slave_phy_setup(p, slave_dev);
+	if (ret) {
+		free_netdev(slave_dev);
+		return NULL;
+	}
 
 	ret = register_netdev(slave_dev);
 	if (ret) {
 		netdev_err(master, "error %d registering interface %s\n",
 			   ret, slave_dev->name);
+		phy_disconnect(p->phy);
 		free_netdev(slave_dev);
 		return NULL;
 	}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e9cb2588e4161c41613c0e50a17ea91b7416a514..18bcaf2ff2fd54627894f6ac28ebc07833739585 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1143,8 +1143,9 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
 			put_child(tp, cindex, (struct rt_trie_node *)tn);
 		} else {
 			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
-			tp = tn;
 		}
+
+		tp = tn;
 	}
 
 	if (tp && tp->pos + tp->bits > 32)
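Hoisting tp = tn out of the else branch fixes the balancing bug: when
the new internal node is pushed down under an existing parent (the
put_child() branch), tp previously kept pointing at the old parent, so
the leaf attached afterwards relative to tp could be hung off the wrong
node.  Sketch of the push-down case:

	/* inserting a key that no longer fits under (n) */
	      tp                        tp
	       |         becomes         |
	      (n)                      (tn)	/* new internal node */
	                               /  \
	                             (n)   new leaf

	/* the leaf hangs off tn, so tp must advance to tn in both branches */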