Allow BKL re-acquire to fail, causing us to re-schedule.

This allows for low-latency BKL contention even with preemption. Previously, since preemption is disabled over context switches, re-acquiring the kernel lock when resuming a process would be non-preemptible.

Allow BKL re-acquire to fail, causing us to re-schedule.
This allows for low-latency BKL contention even with preemption. Previously, since preemption is disabled over context switches, re-acquiring the kernel lock when resuming a process would be non-preemptible.
59a7718e · Linus Torvalds · 6f60f5cf · 59a7718e · 59a7718e · 59a7718e
Commit 59a7718e authored Oct 25, 2004 by Linus Torvalds
Show whitespace changes
Inline Side-by-side

Showing with 58 additions and 17 deletions

include/linux/smp_lock.h include/linux/smp_lock.h +19 -5

kernel/sched.c kernel/sched.c +5 -2

lib/kernel_lock.c lib/kernel_lock.c +34 -10

No files found.
--- a/include/linux/smp_lock.h
+++ b/include/linux/smp_lock.h
@@ -9,7 +9,7 @@
 #define kernel_locked()		(current->lock_depth >= 0)
-extern void __lockfunc get_kernel_lock(void);
+extern int __lockfunc get_kernel_lock(void);
 extern void __lockfunc put_kernel_lock(void);
 /*
@@ -20,10 +20,24 @@ extern void __lockfunc put_kernel_lock(void);
 		put_kernel_lock();		\
 } while (0)
-#define reacquire_kernel_lock(tsk) do {	\
+/*
-	if (unlikely((tsk)->lock_depth >= 0))	\
+ * Non-SMP kernels will never block on the kernel lock,
-		get_kernel_lock();		\
+ * so we are better off returning a constant zero from
-} while (0)
+ * reacquire_kernel_lock() so that the compiler can see
+ * it at compile-time.
+ */
+#ifdef CONFIG_SMP
+#define return_value_on_smp return
+#else
+#define return_value_on_smp
+#endif
+static inline int reacquire_kernel_lock(struct task_struct *task)
+{
+	if (unlikely(task->lock_depth >= 0))
+		return_value_on_smp get_kernel_lock();
+	return 0;
+}
 extern void __lockfunc lock_kernel(void)	__acquires(kernel_lock);
 extern void __lockfunc unlock_kernel(void)	__releases(kernel_lock);

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2502,6 +2502,8 @@ asmlinkage void __sched schedule(void)
 need_resched:
 	preempt_disable();
 	prev = current;
+	release_kernel_lock(prev);
+need_resched_nonpreemptible:
 	rq = this_rq();
 	/*
@@ -2513,7 +2515,6 @@ asmlinkage void __sched schedule(void)
 		dump_stack();
 	}
-	release_kernel_lock(prev);
 	schedstat_inc(rq, sched_cnt);
 	now = sched_clock();
 	if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
@@ -2636,7 +2637,9 @@ asmlinkage void __sched schedule(void)
 	} else
 		spin_unlock_irq(&rq->lock);
-	reacquire_kernel_lock(current);
+	prev = current;
+	if (unlikely(reacquire_kernel_lock(prev) < 0))
+		goto need_resched_nonpreemptible;
 	preempt_enable_no_resched();
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 		goto need_resched;

--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -24,16 +24,40 @@ static spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 /*
 * Acquire/release the underlying lock from the scheduler.
 *
- * The scheduler release and re-acquire currently always happen
+ * This is called with preemption disabled, and should
- * with preemption disabled. Which is likely a bug in the acquire
+ * return an error value if it cannot get the lock and
- * case...
+ * TIF_NEED_RESCHED gets set.
 *
- * Regardless, we try to be polite about preemption. If SMP is
+ * If it successfully gets the lock, it should increment
- * not on (ie UP preemption), this all goes away because the
+ * the preemption count like any spinlock does.
+ *
+ * (This works on UP too - _raw_spin_trylock will never
+ * return false in that case)
+ */
+int __lockfunc get_kernel_lock(void)
+{
+	while (!_raw_spin_trylock(&kernel_flag)) {
+		if (test_thread_flag(TIF_NEED_RESCHED))
+			return -EAGAIN;
+		cpu_relax();
+	}
+	preempt_disable();
+	return 0;
+}
+void __lockfunc put_kernel_lock(void)
+{
+	_raw_spin_unlock(&kernel_flag);
+	preempt_enable_no_resched();
+}
+/*
+ * These are the BKL spinlocks - we try to be polite about preemption. 
+ * If SMP is not on (ie UP preemption), this all goes away because the
 * _raw_spin_trylock() will always succeed.
 */
 #ifdef CONFIG_PREEMPT
-inline void __lockfunc get_kernel_lock(void)
+static inline void __lock_kernel(void)
 {
 	preempt_disable();
 	if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
@@ -65,13 +89,13 @@ inline void __lockfunc get_kernel_lock(void)
 /*
 * Non-preemption case - just get the spinlock
 */
-inline void __lockfunc get_kernel_lock(void)
+static inline void __lock_kernel(void)
 {
 	_raw_spin_lock(&kernel_flag);
 }
 #endif
-inline void __lockfunc put_kernel_lock(void)
+static inline void __unlock_kernel(void)
 {
 	_raw_spin_unlock(&kernel_flag);
 	preempt_enable();
@@ -87,7 +111,7 @@ void __lockfunc lock_kernel(void)
 {
 	int depth = current->lock_depth+1;
 	if (likely(!depth))
-		get_kernel_lock();
+		__lock_kernel();
 	current->lock_depth = depth;
 }
@@ -95,7 +119,7 @@ void __lockfunc unlock_kernel(void)
 {
 	BUG_ON(current->lock_depth < 0);
 	if (likely(--current->lock_depth < 0))
-		put_kernel_lock();
+		__unlock_kernel();
 }
 EXPORT_SYMBOL(lock_kernel);