diff --git a/arch/sparc64/lib/atomic.S b/arch/sparc64/lib/atomic.S
index 26463d8a467ae8cc0923a37b0d1719e15fb91d6d..0b6cd475557d58a8442660b4de969a9c1edd01ff 100644
--- a/arch/sparc64/lib/atomic.S
+++ b/arch/sparc64/lib/atomic.S
@@ -7,8 +7,22 @@
 #include <asm/asi.h>
 
 	.text
-	.align	64
 
+	/* These stubs handle the uncommon case of
+	 * contention on the atomic value.  Moving the
+	 * membar out of line keeps the main fast path
+	 * 8 instructions (32 bytes) long, so it fits
+	 * in a single 64-byte L2 cache line.
+	 */
+__atomic_add_membar:
+	ba,pt	%xcc, __atomic_add
+	 membar	#StoreLoad | #StoreStore
+
+__atomic_sub_membar:
+	ba,pt	%xcc, __atomic_sub
+	 membar	#StoreLoad | #StoreStore
+
+	.align	64
 	.globl	__atomic_add
 	.type	__atomic_add,#function
 __atomic_add: /* %o0 = increment, %o1 = atomic_ptr */
@@ -16,10 +30,10 @@ __atomic_add: /* %o0 = increment, %o1 = atomic_ptr */
 	add	%g5, %o0, %g7
 	cas	[%o1], %g5, %g7
 	cmp	%g5, %g7
-	bne,pn	%icc, __atomic_add
-	 membar	#StoreLoad | #StoreStore
+	bne,pn	%icc, __atomic_add_membar	! contended: membar, then retry
+	 add	%g7, %o0, %g7			! delay slot: new value on success
 	retl
-	 add	%g7, %o0, %o0
+	 sra	%g7, 0, %o0			! sign-extend 32-bit result
 	.size	__atomic_add, .-__atomic_add
 
 	.globl	__atomic_sub
@@ -29,10 +43,10 @@ __atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */
 	sub	%g5, %o0, %g7
 	cas	[%o1], %g5, %g7
 	cmp	%g5, %g7
-	bne,pn	%icc, __atomic_sub
-	 membar	#StoreLoad | #StoreStore
+	bne,pn	%icc, __atomic_sub_membar	! contended: membar, then retry
+	 sub	%g7, %o0, %g7			! delay slot: new value on success
 	retl
-	 sub	%g7, %o0, %o0
+	 sra	%g7, 0, %o0			! sign-extend 32-bit result
 	.size	__atomic_sub, .-__atomic_sub
 
 	.globl	__atomic64_add
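
For reference, here is a minimal C sketch of the same fast-path/slow-path
split, assuming GCC's __sync builtins; the function atomic_add_ret and its
parameter names are illustrative only, not the kernel's interface.  As in
the patch, the barrier is paid only on the contended retry, and the int32_t
return mirrors the sign extension that sra %g7, 0, %o0 performs on the
32-bit result.

	#include <stdint.h>

	/* Sketch only, not the kernel's implementation. */
	static int32_t atomic_add_ret(int32_t inc, volatile int32_t *p)
	{
		for (;;) {
			int32_t old = *p;

			/* CAS: succeeds iff *p still holds old. */
			if (__builtin_expect(
			        __sync_bool_compare_and_swap(p, old, old + inc), 1))
				return old + inc;	/* fast path, no barrier */

			/* Contended retry: pay for the barrier out of line,
			 * like the branch to __atomic_add_membar above.  Note
			 * __sync_synchronize() is a full barrier, stronger than
			 * membar #StoreLoad | #StoreStore.
			 */
			__sync_synchronize();
		}
	}

The __builtin_expect() hint plays the same role as the bne,pn
(predict-not-taken) annotation: the compiler can lay the retry path out
cold, much as the hand-written stubs keep the membar off the
8-instruction fast path.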