Merge bk://kernel.bkbits.net/davem/sparc-2.6

into ppc970.osdl.org:/home/torvalds/v2.6/linux

Merge bk://kernel.bkbits.net/davem/sparc-2.6
into ppc970.osdl.org:/home/torvalds/v2.6/linux
1581782e · Linus Torvalds · 94a75870 · 15acb7ea · 1581782e · 1581782e
Commit 1581782e authored Feb 05, 2005 by Linus Torvalds
12 changed files
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
+		Semantics and Behavior of Atomic and
+		         Bitmask Operations
+
+			  David S. Miller	 
+
+	This document is intended to serve as a guide to Linux port
+maintainers on how to implement atomic counter and bitops interfaces
+properly.
+
+	The atomic_t type should be defined as a signed integer.
+Also, it should be made opaque such that any kind of cast to a normal
+C integer type will fail.  Something like the following should
+suffice:
+
+	typedef struct { volatile int counter; } atomic_t;
+
+	The first operations to implement for atomic_t's are the
+initializers and plain reads.
+
+	#define ATOMIC_INIT(i)		{ (i) }
+	#define atomic_set(v, i)	((v)->counter = (i))
+
+The first macro is used in definitions, such as:
+
+static atomic_t my_counter = ATOMIC_INIT(1);
+
+The second interface can be used at runtime, as in:
+
+	struct foo { atomic_t counter; };
+	...
+
+	struct foo *k;
+
+	k = kmalloc(sizeof(*k), GFP_KERNEL);
+	if (!k)
+		return -ENOMEM;
+	atomic_set(&k->counter, 0);
+
+Next, we have:
+
+	#define atomic_read(v)	((v)->counter)
+
+which simply reads the current value of the counter.
+
+Now, we move onto the actual atomic operation interfaces.
+
+	void atomic_add(int i, atomic_t *v);
+	void atomic_sub(int i, atomic_t *v);
+	void atomic_inc(atomic_t *v);
+	void atomic_dec(atomic_t *v);
+
+These four routines add and subtract integral values to/from the given
+atomic_t value.  The first two routines pass explicit integers by
+which to make the adjustment, whereas the latter two use an implicit
+adjustment value of "1".
+
+One very important aspect of these two routines is that they DO NOT
+require any explicit memory barriers.  They need only perform the
+atomic_t counter update in an SMP safe manner.
+
+Next, we have:
+
+	int atomic_inc_return(atomic_t *v);
+	int atomic_dec_return(atomic_t *v);
+
+These routines add 1 and subtract 1, respectively, from the given
+atomic_t and return the new counter value after the operation is
+performed.
+
+Unlike the above routines, it is required that explicit memory
+barriers are performed before and after the operation.  It must be
+done such that all memory operations before and after the atomic
+operation calls are strongly ordered with respect to the atomic
+operation itself.
+
+For example, it should behave as if a smp_mb() call existed both
+before and after the atomic operation.
+
+If the atomic instructions used in an implementation provide explicit
+memory barrier semantics which satisfy the above requirements, that is
+fine as well.
+
+Let's move on:
+
+	int atomic_add_return(int i, atomic_t *v);
+	int atomic_sub_return(int i, atomic_t *v);
+
+These behave just like atomic_{inc,dec}_return() except that an
+explicit counter adjustment is given instead of the implicit "1".
+This means that like atomic_{inc,dec}_return(), the memory barrier
+semantics are required.
+
+Next:
+
+	int atomic_inc_and_test(atomic_t *v);
+	int atomic_dec_and_test(atomic_t *v);
+
+These two routines increment and decrement by 1, respectively, the
+given atomic counter.  They return a boolean indicating whether the
+resulting counter value was zero or not.
+
+It requires explicit memory barrier semantics around the operation as
+above.
+
+	int atomic_sub_and_test(int i, atomic_t *v);
+
+This is identical to atomic_dec_and_test() except that an explicit
+decrement is given instead of the implicit "1".  It requires explicit
+memory barrier semantics around the operation.
+
+	int atomic_add_negative(int i, atomic_t *v);
+
+The given increment is added to the given atomic counter value.  A
+boolean is return which indicates whether the resulting counter value
+is negative.  It requires explicit memory barrier semantics around the
+operation.
+
+If a caller requires memory barrier semantics around an atomic_t
+operation which does not return a value, a set of interfaces are
+defined which accomplish this:
+
+	void smb_mb__before_atomic_dec(void);
+	void smb_mb__after_atomic_dec(void);
+	void smb_mb__before_atomic_inc(void);
+	void smb_mb__after_atomic_dec(void);
+
+For example, smb_mb__before_atomic_dec() can be used like so:
+
+	obj->dead = 1;
+	smb_mb__before_atomic_dec();
+	atomic_dec(&obj->ref_count);
+
+It makes sure that all memory operations preceeding the atomic_dec()
+call are strongly ordered with respect to the atomic counter
+operation.  In the above example, it guarentees that the assignment of
+"1" to obj->dead will be globally visible to other cpus before the
+atomic counter decrement.
+
+Without the explicitl smb_mb__before_atomic_dec() call, the
+implementation could legally allow the atomic counter update visible
+to other cpus before the "obj->dead = 1;" assignment.
+
+The other three interfaces listed are used to provide explicit
+ordering with respect to memory operations after an atomic_dec() call
+(smb_mb__after_atomic_dec()) and around atomic_inc() calls
+(smb_mb__{before,after}_atomic_inc()).
+
+A missing memory barrier in the cases where they are required by the
+atomic_t implementation above can have disasterous results.  Here is
+an example, which follows a pattern occuring frequently in the Linux
+kernel.  It is the use of atomic counters to implement reference
+counting, and it works such that once the counter falls to zero it can
+be guarenteed that no other entity can be accessing the object:
+
+static void obj_list_add(struct obj *obj)
+{
+	obj->active = 1;
+	list_add(&obj->list);
+}
+
+static void obj_list_del(struct obj *obj)
+{
+	list_del(&obj->list);
+	obj->active = 0;
+}
+
+static void obj_destroy(struct obj *obj)
+{
+	BUG_ON(obj->active);
+	kfree(obj);
+}
+
+struct obj *obj_list_peek(struct list_head *head)
+{
+	if (!list_empty(head)) {
+		struct obj *obj;
+
+		obj = list_entry(head->next, struct obj, list);
+		atomic_inc(&obj->refcnt);
+		return obj;
+	}
+	return NULL;
+}
+
+void obj_poke(void)
+{
+	struct obj *obj;
+
+	spin_lock(&global_list_lock);
+	obj = obj_list_peek(&global_list);
+	spin_unlock(&global_list_lock);
+
+	if (obj) {
+		obj->ops->poke(obj);
+		if (atomic_dec_and_test(&obj->refcnt))
+			obj_destroy(obj);
+	}
+}
+
+void obj_timeout(struct obj *obj)
+{
+	spin_lock(&global_list_lock);
+	obj_list_del(obj);
+	spin_unlock(&global_list_lock);
+
+	if (atomic_dec_and_test(&obj->refcnt))
+		obj_destroy(obj);
+}
+
+(This is a simplification of the ARP queue management in the
+ generic neighbour discover code of the networking.  Olaf Kirch
+ found a bug wrt. memory barriers in kfree_skb() that exposed
+ the atomic_t memory barrier requirements quite clearly.)
+
+Given the above scheme, it must be the case that the obj->active
+update done by the obj list deletion be visible to other processors
+before the atomic counter decrement is performed.
+
+Otherwise, the counter could fall to zero, yet obj->active would still
+be set, thus triggering the assertion in obj_destroy().  The error
+sequence looks like this:
+
+	cpu 0				cpu 1
+	obj_poke()			obj_timeout()
+	obj = obj_list_peek();
+	... gains ref to obj, refcnt=2
+					obj_list_del(obj);
+					obj->active = 0 ...
+					... visibility delayed ...
+					atomic_dec_and_test()
+					... refcnt drops to 1 ...
+	atomic_dec_and_test()
+	... refcount drops to 0 ...
+	obj_destroy()
+	BUG() triggers since obj->active
+	still seen as one
+					obj->active update visibility occurs
+
+With the memory barrier semantics required of the atomic_t operations
+which return values, the above sequence of memory visibility can never
+happen.  Specifically, in the above case the atomic_dec_and_test()
+counter decrement would not become globally visible until the
+obj->active update does.
+
+We will now cover the atomic bitmask operations.  You will find that
+their SMP and memory barrier semantics are similar in shape and scope
+to the atomic_t ops above.
+
+Native atomic bit operations are defined to operate on objects aligned
+to the size of an "unsigned long" C data type, and are least of that
+size.  The endianness of the bits within each "unsigned long" are the
+native endianness of the cpu.
+
+	void set_bit(unsigned long nr, volatils unsigned long *addr);
+	void clear_bit(unsigned long nr, volatils unsigned long *addr);
+	void change_bit(unsigned long nr, volatils unsigned long *addr);
+
+These routines set, clear, and change, respectively, the bit number
+indicated by "nr" on the bit mask pointed to by "ADDR".
+
+They must execute atomically, yet there are no implicit memory barrier
+semantics required of these interfaces.
+
+	int test_and_set_bit(unsigned long nr, volatils unsigned long *addr);
+	int test_and_clear_bit(unsigned long nr, volatils unsigned long *addr);
+	int test_and_change_bit(unsigned long nr, volatils unsigned long *addr);
+
+Like the above, except that these routines return a boolean which
+indicates whether the changed bit was set _BEFORE_ the atomic bit
+operation.
+
+WARNING! It is incredibly important that the value be a boolean,
+ie. "0" or "1".  Do not try to be fancy and save a few instructions by
+declaring the above to return "long" and just returning something like
+"old_val & mask" because that will not work.
+
+For one thing, this return value gets truncated to int in many code
+paths using these interfaces, so on 64-bit if the bit is set in the
+upper 32-bits then testers will never see that.
+
+One great example of where this problem crops up are the thread_info
+flag operations.  Routines such as test_and_set_ti_thread_flag() chop
+the return value into an int.  There are other places where things
+like this occur as well.
+
+These routines, like the atomic_t counter operations returning values,
+require explicit memory barrier semantics around their execution.  All
+memory operations before the atomic bit operation call must be made
+visible globally before the atomic bit operation is made visible.
+Likewise, the atomic bit operation must be visible globally before any
+subsequent memory operation is made visible.  For example:
+
+	obj->dead = 1;
+	if (test_and_set_bit(0, &obj->flags))
+		/* ... */;
+	obj->killed = 1;
+
+The implementation of test_and_set_bit() must guarentee that
+"obj->dead = 1;" is visible to cpus before the atomic memory operation
+done by test_and_set_bit() becomes visible.  Likewise, the atomic
+memory operation done by test_and_set_bit() must become visible before
+"obj->killed = 1;" is visible.
+
+Finally there is the basic operation:
+
+	int test_bit(unsigned long nr, __const__ volatile unsigned long *addr);
+
+Which returns a boolean indicating if bit "nr" is set in the bitmask
+pointed to by "addr".
+
+If explicit memory barriers are required around clear_bit() (which
+does not return a value, and thus does not need to provide memory
+barrier semantics), two interfaces are provided:
+
+	void smp_mb__before_clear_bit(void);
+	void smp_mb__after_clear_bit(void);
+
+They are used as follows, and are akin to their atomic_t operation
+brothers:
+
+	/* All memory operations before this call will
+	 * be globally visible before the clear_bit().
+	 */
+	smp_mb__before_clear_bit();
+	clear_bit( ... );
+
+	/* The clear_bit() will be visible before all
+	 * subsequent memory operations.
+	 */
+	 smp_mb__after_clear_bit();
+
+Finally, there are non-atomic versions of the bitmask operations
+provided.  They are used in contexts where some other higher-level SMP
+locking scheme is being used to protect the bitmask, and thus less
+expensive non-atomic operations may be used in the implementation.
+They have names similar to the above bitmask operation interfaces,
+except that two underscores are prefixed to the interface name.
+
+	void __set_bit(unsigned long nr, volatile unsigned long *addr);
+	void __clear_bit(unsigned long nr, volatile unsigned long *addr);
+	void __change_bit(unsigned long nr, volatile unsigned long *addr);
+	int __test_and_set_bit(unsigned long nr, volatile unsigned long *addr);
+	int __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr);
+	int __test_and_change_bit(unsigned long nr, volatile unsigned long *addr);
+
+These non-atomic variants also do not require any special memory
+barrier semantics.
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ b/arch/sparc64/kernel/binfmt_aout32.c
@@ -333,9 +333,8 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->start_stack =
 		(unsigned long) create_aout32_tables((char __user *)bprm->p, bprm);
 	if (!(orig_thr_flags & _TIF_32BIT)) {
-		unsigned long pgd_cache;
+		unsigned long pgd_cache = get_pgd_cache(current->mm->pgd);

-		pgd_cache = ((unsigned long)pgd_val(current->mm->pgd[0]))<<11;
 		__asm__ __volatile__("stxa\t%0, [%1] %2\n\t"
 				     "membar #Sync"
 				     : /* no outputs */

--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -440,7 +440,7 @@ void flush_thread(void)
 				pmd_t *page = pmd_alloc_one(mm, 0);
 				pud_set(pud0, page);
 			}
-			pgd_cache = ((unsigned long) pud_val(*pud0)) << 11UL;
+			pgd_cache = get_pgd_cache(pgd0);
 		}
 		__asm__ __volatile__("stxa %0, [%1] %2\n\t"
 				     "membar #Sync"

--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -894,9 +894,8 @@ static unsigned long penguins_are_doing_time;

 void smp_capture(void)
 {
-	int result = __atomic_add(1, &smp_capture_depth);
+	int result = atomic_add_ret(1, &smp_capture_depth);

-	membar("#StoreStore | #LoadStore");
 	if (result == 1) {
 		int ncpus = num_online_cpus();


--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -172,18 +172,25 @@ EXPORT_SYMBOL(down_interruptible);
 EXPORT_SYMBOL(up);

 /* Atomic counter implementation. */
-EXPORT_SYMBOL(__atomic_add);
-EXPORT_SYMBOL(__atomic_sub);
-EXPORT_SYMBOL(__atomic64_add);
-EXPORT_SYMBOL(__atomic64_sub);
+EXPORT_SYMBOL(atomic_add);
+EXPORT_SYMBOL(atomic_add_ret);
+EXPORT_SYMBOL(atomic_sub);
+EXPORT_SYMBOL(atomic_sub_ret);
+EXPORT_SYMBOL(atomic64_add);
+EXPORT_SYMBOL(atomic64_add_ret);
+EXPORT_SYMBOL(atomic64_sub);
+EXPORT_SYMBOL(atomic64_sub_ret);
 #ifdef CONFIG_SMP
 EXPORT_SYMBOL(_atomic_dec_and_lock);
 #endif

 /* Atomic bit operations. */
-EXPORT_SYMBOL(___test_and_set_bit);
-EXPORT_SYMBOL(___test_and_clear_bit);
-EXPORT_SYMBOL(___test_and_change_bit);
+EXPORT_SYMBOL(test_and_set_bit);
+EXPORT_SYMBOL(test_and_clear_bit);
+EXPORT_SYMBOL(test_and_change_bit);
+EXPORT_SYMBOL(set_bit);
+EXPORT_SYMBOL(clear_bit);
+EXPORT_SYMBOL(change_bit);

 /* Bit searching */
 EXPORT_SYMBOL(find_next_bit);

--- a/arch/sparc64/lib/atomic.S
+++ b/arch/sparc64/lib/atomic.S
@@ -4,73 +4,136 @@
 * Copyright (C) 1999 David S. Miller (davem@redhat.com)
 */

+#include <linux/config.h>
 #include <asm/asi.h>

+	/* On SMP we need to use memory barriers to ensure
+	 * correct memory operation ordering, nop these out
+	 * for uniprocessor.
+	 */
+#ifdef CONFIG_SMP
+#define ATOMIC_PRE_BARRIER	membar #StoreLoad | #LoadLoad
+#define ATOMIC_POST_BARRIER	membar #StoreLoad | #StoreStore
+#else
+#define ATOMIC_PRE_BARRIER	nop
+#define ATOMIC_POST_BARRIER	nop
+#endif
+
 	.text

-	/* We use these stubs for the uncommon case
-	 * of contention on the atomic value.  This is
-	 * so that we can keep the main fast path 8
-	 * instructions long and thus fit into a single
-	 * L2 cache line.
+	/* Two versions of the atomic routines, one that
+	 * does not return a value and does not perform
+	 * memory barriers, and a second which returns
+	 * a value and does the barriers.
 	 */
-__atomic_add_membar:
-	ba,pt	%xcc, __atomic_add
-	 membar	#StoreLoad | #StoreStore
+	.globl	atomic_add
+	.type	atomic_add,#function
+atomic_add: /* %o0 = increment, %o1 = atomic_ptr */
+1:	lduw	[%o1], %g5
+	add	%g5, %o0, %g7
+	cas	[%o1], %g5, %g7
+	cmp	%g5, %g7
+	bne,pn	%icc, 1b
+	 nop
+	retl
+	 nop
+	.size	atomic_add, .-atomic_add

-__atomic_sub_membar:
-	ba,pt	%xcc, __atomic_sub
-	 membar	#StoreLoad | #StoreStore
+	.globl	atomic_sub
+	.type	atomic_sub,#function
+atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */
+1:	lduw	[%o1], %g5
+	sub	%g5, %o0, %g7
+	cas	[%o1], %g5, %g7
+	cmp	%g5, %g7
+	bne,pn	%icc, 1b
+	 nop
+	retl
+	 nop
+	.size	atomic_sub, .-atomic_sub

-	.align	64
-	.globl	__atomic_add
-	.type	__atomic_add,#function
-__atomic_add: /* %o0 = increment, %o1 = atomic_ptr */
-	lduw	[%o1], %g5
+	.globl	atomic_add_ret
+	.type	atomic_add_ret,#function
+atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
+	ATOMIC_PRE_BARRIER
+1:	lduw	[%o1], %g5
 	add	%g5, %o0, %g7
 	cas	[%o1], %g5, %g7
 	cmp	%g5, %g7
-	bne,pn	%icc, __atomic_add_membar
+	bne,pn	%icc, 1b
 	 add	%g7, %o0, %g7
+	ATOMIC_POST_BARRIER
 	retl
 	 sra	%g7, 0, %o0
-	.size	__atomic_add, .-__atomic_add
+	.size	atomic_add_ret, .-atomic_add_ret

-	.globl	__atomic_sub
-	.type	__atomic_sub,#function
-__atomic_sub: /* %o0 = increment, %o1 = atomic_ptr */
-	lduw	[%o1], %g5
+	.globl	atomic_sub_ret
+	.type	atomic_sub_ret,#function
+atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */
+	ATOMIC_PRE_BARRIER
+1:	lduw	[%o1], %g5
 	sub	%g5, %o0, %g7
 	cas	[%o1], %g5, %g7
 	cmp	%g5, %g7
-	bne,pn	%icc, __atomic_sub_membar
+	bne,pn	%icc, 1b
 	 sub	%g7, %o0, %g7
+	ATOMIC_POST_BARRIER
 	retl
 	 sra	%g7, 0, %o0
-	.size	__atomic_sub, .-__atomic_sub
+	.size	atomic_sub_ret, .-atomic_sub_ret
+
+	.globl	atomic64_add
+	.type	atomic64_add,#function
+atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */
+1:	ldx	[%o1], %g5
+	add	%g5, %o0, %g7
+	casx	[%o1], %g5, %g7
+	cmp	%g5, %g7
+	bne,pn	%xcc, 1b
+	 nop
+	retl
+	 nop
+	.size	atomic64_add, .-atomic64_add

-	.globl	__atomic64_add
-	.type	__atomic64_add,#function
-__atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */
-	ldx	[%o1], %g5
+	.globl	atomic64_sub
+	.type	atomic64_sub,#function
+atomic64_sub: /* %o0 = decrement, %o1 = atomic_ptr */
+1:	ldx	[%o1], %g5
+	sub	%g5, %o0, %g7
+	casx	[%o1], %g5, %g7
+	cmp	%g5, %g7
+	bne,pn	%xcc, 1b
+	 nop
+	retl
+	 nop
+	.size	atomic64_sub, .-atomic64_sub
+
+	.globl	atomic64_add_ret
+	.type	atomic64_add_ret,#function
+atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
+	ATOMIC_PRE_BARRIER
+1:	ldx	[%o1], %g5
 	add	%g5, %o0, %g7
 	casx	[%o1], %g5, %g7
 	cmp	%g5, %g7
-	bne,pn	%xcc, __atomic64_add
-	 membar	#StoreLoad | #StoreStore
+	bne,pn	%xcc, 1b
+	 add	%g7, %o0, %g7
+	ATOMIC_POST_BARRIER
 	retl
-	 add	%g7, %o0, %o0
-	.size	__atomic64_add, .-__atomic64_add
+	 mov	%g7, %o0
+	.size	atomic64_add_ret, .-atomic64_add_ret

-	.globl	__atomic64_sub
-	.type	__atomic64_sub,#function
-__atomic64_sub: /* %o0 = increment, %o1 = atomic_ptr */
-	ldx	[%o1], %g5
+	.globl	atomic64_sub_ret
+	.type	atomic64_sub_ret,#function
+atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */
+	ATOMIC_PRE_BARRIER
+1:	ldx	[%o1], %g5
 	sub	%g5, %o0, %g7
 	casx	[%o1], %g5, %g7
 	cmp	%g5, %g7
-	bne,pn	%xcc, __atomic64_sub
-	 membar	#StoreLoad | #StoreStore
+	bne,pn	%xcc, 1b
+	 sub	%g7, %o0, %g7
+	ATOMIC_POST_BARRIER
 	retl
-	 sub	%g7, %o0, %o0
-	.size	__atomic64_sub, .-__atomic64_sub
+	 mov	%g7, %o0
+	.size	atomic64_sub_ret, .-atomic64_sub_ret
--- a/arch/sparc64/lib/bitops.S
+++ b/arch/sparc64/lib/bitops.S
@@ -4,69 +4,142 @@
 * Copyright (C) 2000 David S. Miller (davem@redhat.com)
 */

+#include <linux/config.h>
 #include <asm/asi.h>

+	/* On SMP we need to use memory barriers to ensure
+	 * correct memory operation ordering, nop these out
+	 * for uniprocessor.
+	 */
+#ifdef CONFIG_SMP
+#define BITOP_PRE_BARRIER	membar #StoreLoad | #LoadLoad
+#define BITOP_POST_BARRIER	membar #StoreLoad | #StoreStore
+#else
+#define BITOP_PRE_BARRIER	nop
+#define BITOP_POST_BARRIER	nop
+#endif
+
 	.text
-	.align	64
-	.globl	___test_and_set_bit
-	.type	___test_and_set_bit,#function
-___test_and_set_bit:	/* %o0=nr, %o1=addr */
+
+	.globl	test_and_set_bit
+	.type	test_and_set_bit,#function
+test_and_set_bit:	/* %o0=nr, %o1=addr */
+	BITOP_PRE_BARRIER
+	srlx	%o0, 6, %g1
+	mov	1, %g5
+	sllx	%g1, 3, %g3
+	and	%o0, 63, %g2
+	sllx	%g5, %g2, %g5
+	add	%o1, %g3, %o1
+1:	ldx	[%o1], %g7
+	or	%g7, %g5, %g1
+	casx	[%o1], %g7, %g1
+	cmp	%g7, %g1
+	bne,pn	%xcc, 1b
+	 and	%g7, %g5, %g2
+	BITOP_POST_BARRIER
+	clr	%o0
+	retl
+	 movrne	%g2, 1, %o0
+	.size	test_and_set_bit, .-test_and_set_bit
+
+	.globl	test_and_clear_bit
+	.type	test_and_clear_bit,#function
+test_and_clear_bit:	/* %o0=nr, %o1=addr */
+	BITOP_PRE_BARRIER
+	srlx	%o0, 6, %g1
+	mov	1, %g5
+	sllx	%g1, 3, %g3
+	and	%o0, 63, %g2
+	sllx	%g5, %g2, %g5
+	add	%o1, %g3, %o1
+1:	ldx	[%o1], %g7
+	andn	%g7, %g5, %g1
+	casx	[%o1], %g7, %g1
+	cmp	%g7, %g1
+	bne,pn	%xcc, 1b
+	 and	%g7, %g5, %g2
+	BITOP_POST_BARRIER
+	clr	%o0
+	retl
+	 movrne	%g2, 1, %o0
+	.size	test_and_clear_bit, .-test_and_clear_bit
+
+	.globl	test_and_change_bit
+	.type	test_and_change_bit,#function
+test_and_change_bit:	/* %o0=nr, %o1=addr */
+	BITOP_PRE_BARRIER
+	srlx	%o0, 6, %g1
+	mov	1, %g5
+	sllx	%g1, 3, %g3
+	and	%o0, 63, %g2
+	sllx	%g5, %g2, %g5
+	add	%o1, %g3, %o1
+1:	ldx	[%o1], %g7
+	xor	%g7, %g5, %g1
+	casx	[%o1], %g7, %g1
+	cmp	%g7, %g1
+	bne,pn	%xcc, 1b
+	 and	%g7, %g5, %g2
+	BITOP_POST_BARRIER
+	clr	%o0
+	retl
+	 movrne	%g2, 1, %o0
+	.size	test_and_change_bit, .-test_and_change_bit
+
+	.globl	set_bit
+	.type	set_bit,#function
+set_bit:		/* %o0=nr, %o1=addr */
 	srlx	%o0, 6, %g1
 	mov	1, %g5
 	sllx	%g1, 3, %g3
 	and	%o0, 63, %g2
 	sllx	%g5, %g2, %g5
 	add	%o1, %g3, %o1
-	ldx	[%o1], %g7
-1:	andcc	%g7, %g5, %o0
-	bne,pn	%xcc, 2f
-	 xor	%g7, %g5, %g1
+1:	ldx	[%o1], %g7
+	or	%g7, %g5, %g1
 	casx	[%o1], %g7, %g1
 	cmp	%g7, %g1
-	bne,a,pn %xcc, 1b
-	 ldx	[%o1], %g7
-2:	retl
-	 membar	#StoreLoad | #StoreStore
-	.size	___test_and_set_bit, .-___test_and_set_bit
+	bne,pn	%xcc, 1b
+	 nop
+	retl
+	 nop
+	.size	set_bit, .-set_bit

-	.globl	___test_and_clear_bit
-	.type	___test_and_clear_bit,#function
-___test_and_clear_bit:	/* %o0=nr, %o1=addr */
+	.globl	clear_bit
+	.type	clear_bit,#function
+clear_bit:		/* %o0=nr, %o1=addr */
 	srlx	%o0, 6, %g1
 	mov	1, %g5
 	sllx	%g1, 3, %g3
 	and	%o0, 63, %g2
 	sllx	%g5, %g2, %g5
 	add	%o1, %g3, %o1
-	ldx	[%o1], %g7
-1:	andcc	%g7, %g5, %o0
-	be,pn	%xcc, 2f
-	 xor	%g7, %g5, %g1
+1:	ldx	[%o1], %g7
+	andn	%g7, %g5, %g1
 	casx	[%o1], %g7, %g1
 	cmp	%g7, %g1
-	bne,a,pn %xcc, 1b
-	 ldx	[%o1], %g7
-2:	retl
-	 membar	#StoreLoad | #StoreStore
-	.size	___test_and_clear_bit, .-___test_and_clear_bit
+	bne,pn	%xcc, 1b
+	 nop
+	retl
+	 nop
+	.size	clear_bit, .-clear_bit

-	.globl	___test_and_change_bit
-	.type	___test_and_change_bit,#function
-___test_and_change_bit:	/* %o0=nr, %o1=addr */
+	.globl	change_bit
+	.type	change_bit,#function
+change_bit:		/* %o0=nr, %o1=addr */
 	srlx	%o0, 6, %g1
 	mov	1, %g5
 	sllx	%g1, 3, %g3
 	and	%o0, 63, %g2
 	sllx	%g5, %g2, %g5
 	add	%o1, %g3, %o1
-	ldx	[%o1], %g7
-1:	and	%g7, %g5, %o0
+1:	ldx	[%o1], %g7
 	xor	%g7, %g5, %g1
 	casx	[%o1], %g7, %g1
 	cmp	%g7, %g1
-	bne,a,pn %xcc, 1b
-	 ldx	[%o1], %g7
-2:	retl
-	 membar	#StoreLoad | #StoreStore
-	nop
-	.size	___test_and_change_bit, .-___test_and_change_bit
+	bne,pn	%xcc, 1b
+	 nop
+	retl
+	 nop
+	.size	change_bit, .-change_bit
--- a/drivers/video/cg14.c
+++ b/drivers/video/cg14.c
@@ -469,9 +469,9 @@ static void cg14_init_one(struct sbus_dev *sdev, int node, int parent_node)
 	int is_8mb, linebytes, i;

 	if (!sdev) {
-		prom_getproperty(node, "address",
-				 (char *) &bases[0], sizeof(bases));
-		if (!bases[0]) {
+		if (prom_getproperty(node, "address",
+				     (char *) &bases[0], sizeof(bases)) <= 0
+		    || !bases[0]) {
 			printk(KERN_ERR "cg14: Device is not mapped.\n");
 			return;
 		}

--- a/include/asm-sparc64/atomic.h
+++ b/include/asm-sparc64/atomic.h
@@ -8,6 +8,7 @@
 #ifndef __ARCH_SPARC64_ATOMIC__
 #define __ARCH_SPARC64_ATOMIC__

+#include <linux/config.h>
 #include <linux/types.h>

 typedef struct { volatile int counter; } atomic_t;
@@ -22,29 +23,27 @@ typedef struct { volatile __s64 counter; } atomic64_t;
 #define atomic_set(v, i)	(((v)->counter) = i)
 #define atomic64_set(v, i)	(((v)->counter) = i)

-extern int __atomic_add(int, atomic_t *);
-extern int __atomic64_add(__s64, atomic64_t *);
+extern void atomic_add(int, atomic_t *);
+extern void atomic64_add(int, atomic64_t *);
+extern void atomic_sub(int, atomic_t *);
+extern void atomic64_sub(int, atomic64_t *);

-extern int __atomic_sub(int, atomic_t *);
-extern int __atomic64_sub(__s64, atomic64_t *);
+extern int atomic_add_ret(int, atomic_t *);
+extern int atomic64_add_ret(int, atomic64_t *);
+extern int atomic_sub_ret(int, atomic_t *);
+extern int atomic64_sub_ret(int, atomic64_t *);

-#define atomic_add(i, v) ((void)__atomic_add(i, v))
-#define atomic64_add(i, v) ((void)__atomic64_add(i, v))
+#define atomic_dec_return(v) atomic_sub_ret(1, v)
+#define atomic64_dec_return(v) atomic64_sub_ret(1, v)

-#define atomic_sub(i, v) ((void)__atomic_sub(i, v))
-#define atomic64_sub(i, v) ((void)__atomic64_sub(i, v))
+#define atomic_inc_return(v) atomic_add_ret(1, v)
+#define atomic64_inc_return(v) atomic64_add_ret(1, v)

-#define atomic_dec_return(v) __atomic_sub(1, v)
-#define atomic64_dec_return(v) __atomic64_sub(1, v)
+#define atomic_sub_return(i, v) atomic_sub_ret(i, v)
+#define atomic64_sub_return(i, v) atomic64_sub_ret(i, v)

-#define atomic_inc_return(v) __atomic_add(1, v)
-#define atomic64_inc_return(v) __atomic64_add(1, v)
-
-#define atomic_sub_return(i, v) __atomic_sub(i, v)
-#define atomic64_sub_return(i, v) __atomic64_sub(i, v)
-
-#define atomic_add_return(i, v) __atomic_add(i, v)
-#define atomic64_add_return(i, v) __atomic64_add(i, v)
+#define atomic_add_return(i, v) atomic_add_ret(i, v)
+#define atomic64_add_return(i, v) atomic64_add_ret(i, v)

 /*
 * atomic_inc_and_test - increment and test
@@ -56,25 +55,32 @@ extern int __atomic64_sub(__s64, atomic64_t *);
 */
 #define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)

-#define atomic_sub_and_test(i, v) (__atomic_sub(i, v) == 0)
-#define atomic64_sub_and_test(i, v) (__atomic64_sub(i, v) == 0)
+#define atomic_sub_and_test(i, v) (atomic_sub_ret(i, v) == 0)
+#define atomic64_sub_and_test(i, v) (atomic64_sub_ret(i, v) == 0)

-#define atomic_dec_and_test(v) (__atomic_sub(1, v) == 0)
-#define atomic64_dec_and_test(v) (__atomic64_sub(1, v) == 0)
+#define atomic_dec_and_test(v) (atomic_sub_ret(1, v) == 0)
+#define atomic64_dec_and_test(v) (atomic64_sub_ret(1, v) == 0)

-#define atomic_inc(v) ((void)__atomic_add(1, v))
-#define atomic64_inc(v) ((void)__atomic64_add(1, v))
+#define atomic_inc(v) atomic_add(1, v)
+#define atomic64_inc(v) atomic64_add(1, v)

-#define atomic_dec(v) ((void)__atomic_sub(1, v))
-#define atomic64_dec(v) ((void)__atomic64_sub(1, v))
+#define atomic_dec(v) atomic_sub(1, v)
+#define atomic64_dec(v) atomic64_sub(1, v)

-#define atomic_add_negative(i, v) (__atomic_add(i, v) < 0)
-#define atomic64_add_negative(i, v) (__atomic64_add(i, v) < 0)
+#define atomic_add_negative(i, v) (atomic_add_ret(i, v) < 0)
+#define atomic64_add_negative(i, v) (atomic64_add_ret(i, v) < 0)

 /* Atomic operations are already serializing */
+#ifdef CONFIG_SMP
+#define smp_mb__before_atomic_dec()	membar("#StoreLoad | #LoadLoad")
+#define smp_mb__after_atomic_dec()	membar("#StoreLoad | #StoreStore")
+#define smp_mb__before_atomic_inc()	membar("#StoreLoad | #LoadLoad")
+#define smp_mb__after_atomic_inc()	membar("#StoreLoad | #StoreStore")
+#else
 #define smp_mb__before_atomic_dec()	barrier()
 #define smp_mb__after_atomic_dec()	barrier()
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
+#endif

 #endif /* !(__ARCH_SPARC64_ATOMIC__) */
--- a/include/asm-sparc64/bitops.h
+++ b/include/asm-sparc64/bitops.h
@@ -7,19 +7,16 @@
 #ifndef _SPARC64_BITOPS_H
 #define _SPARC64_BITOPS_H

+#include <linux/config.h>
 #include <linux/compiler.h>
 #include <asm/byteorder.h>

-extern long ___test_and_set_bit(unsigned long nr, volatile unsigned long *addr);
-extern long ___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr);
-extern long ___test_and_change_bit(unsigned long nr, volatile unsigned long *addr);
-
-#define test_and_set_bit(nr,addr)	({___test_and_set_bit(nr,addr)!=0;})
-#define test_and_clear_bit(nr,addr)	({___test_and_clear_bit(nr,addr)!=0;})
-#define test_and_change_bit(nr,addr)	({___test_and_change_bit(nr,addr)!=0;})
-#define set_bit(nr,addr)		((void)___test_and_set_bit(nr,addr))
-#define clear_bit(nr,addr)		((void)___test_and_clear_bit(nr,addr))
-#define change_bit(nr,addr)		((void)___test_and_change_bit(nr,addr))
+extern int test_and_set_bit(unsigned long nr, volatile unsigned long *addr);
+extern int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr);
+extern int test_and_change_bit(unsigned long nr, volatile unsigned long *addr);
+extern void set_bit(unsigned long nr, volatile unsigned long *addr);
+extern void clear_bit(unsigned long nr, volatile unsigned long *addr);
+extern void change_bit(unsigned long nr, volatile unsigned long *addr);

 /* "non-atomic" versions... */

@@ -74,8 +71,13 @@ static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr
 	return ((old & mask) != 0);
 }

-#define smp_mb__before_clear_bit()	do { } while(0)
-#define smp_mb__after_clear_bit()	do { } while(0)
+#ifdef CONFIG_SMP
+#define smp_mb__before_clear_bit()	membar("#StoreLoad | #LoadLoad")
+#define smp_mb__after_clear_bit()	membar("#StoreLoad | #StoreStore")
+#else
+#define smp_mb__before_clear_bit()	barrier()
+#define smp_mb__after_clear_bit()	barrier()
+#endif

 static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr)
 {
@@ -230,9 +232,9 @@ extern unsigned long find_next_zero_bit(const unsigned long *,
        find_next_zero_bit((addr), (size), 0)

 #define test_and_set_le_bit(nr,addr)	\
-	({ ___test_and_set_bit((nr) ^ 0x38, (addr)) != 0; })
+	test_and_set_bit((nr) ^ 0x38, (addr))
 #define test_and_clear_le_bit(nr,addr)	\
-	({ ___test_and_clear_bit((nr) ^ 0x38, (addr)) != 0; })
+	test_and_clear_bit((nr) ^ 0x38, (addr))

 static __inline__ int test_le_bit(int nr, __const__ unsigned long * addr)
 {
@@ -251,12 +253,21 @@ extern unsigned long find_next_zero_le_bit(unsigned long *, unsigned long, unsig

 #ifdef __KERNEL__

+#define __set_le_bit(nr, addr) \
+	__set_bit((nr) ^ 0x38, (addr))
+#define __clear_le_bit(nr, addr) \
+	__clear_bit((nr) ^ 0x38, (addr))
+#define __test_and_clear_le_bit(nr, addr) \
+	__test_and_clear_bit((nr) ^ 0x38, (addr))
+#define __test_and_set_le_bit(nr, addr) \
+	__test_and_set_bit((nr) ^ 0x38, (addr))
+
 #define ext2_set_bit(nr,addr)	\
-	test_and_set_le_bit((nr),(unsigned long *)(addr))
+	__test_and_set_le_bit((nr),(unsigned long *)(addr))
 #define ext2_set_bit_atomic(lock,nr,addr) \
 	test_and_set_le_bit((nr),(unsigned long *)(addr))
 #define ext2_clear_bit(nr,addr)	\
-	test_and_clear_le_bit((nr),(unsigned long *)(addr))
+	__test_and_clear_le_bit((nr),(unsigned long *)(addr))
 #define ext2_clear_bit_atomic(lock,nr,addr) \
 	test_and_clear_le_bit((nr),(unsigned long *)(addr))
 #define ext2_test_bit(nr,addr)	\

--- a/include/asm-sparc64/mmu_context.h
+++ b/include/asm-sparc64/mmu_context.h
@@ -83,8 +83,7 @@ do { \
 	paddr = __pa((__mm)->pgd); \
 	pgd_cache = 0UL; \
 	if ((__tsk)->thread_info->flags & _TIF_32BIT) \
-		pgd_cache = \
-		  ((unsigned long)pgd_val((__mm)->pgd[0])) << 11UL; \
+		pgd_cache = get_pgd_cache((__mm)->pgd); \
 	__asm__ __volatile__("wrpr	%%g0, 0x494, %%pstate\n\t" \
 			     "mov	%3, %%g4\n\t" \
 			     "mov	%0, %%g7\n\t" \

--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -312,6 +312,11 @@ static inline pte_t pte_modify(pte_t orig_pte, pgprot_t new_prot)
 /* to find an entry in a kernel page-table-directory */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)

+/* extract the pgd cache used for optimizing the tlb miss
+ * slow path when executing 32-bit compat processes
+ */
+#define get_pgd_cache(pgd)	((unsigned long) pgd_val(*pgd) << 11)
+
 /* Find an entry in the second-level page table.. */
 #define pmd_offset(pudp, address)	\
 	((pmd_t *) pud_page(*(pudp)) + \