perf lock contention: Use per-cpu array map for spinlocks

Currently lock contention timestamp is maintained in a hash map keyed by pid. That means it needs to get and release a map element (which is proctected by spinlock!) on each contention begin and end pair. This can impact on performance if there are a lot of contention (usually from spinlocks). It used to go with task local storage but it had an issue on memory allocation in some critical paths. Although it's addressed in recent kernels IIUC, the tool should support old kernels too. So it cannot simply switch to the task local storage at least for now. As spinlocks create lots of contention and they disabled preemption during the spinning, it can use per-cpu array to keep the timestamp to avoid overhead in hashmap update and delete. In contention_begin, it's easy to check the lock types since it can see the flags. But contention_end cannot see it. So let's try to per-cpu array first (unconditionally) if it has an active element (lock != 0). Then it should be used and per-task tstamp map should not be used until the per-cpu array element is cleared which means nested spinlock contention (if any) was finished and it nows see (the outer) lock. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Ian Rogers <irogers@google.com> Cc: Hao Luo <haoluo@google.com> Cc: Song Liu <song@kernel.org> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20231020204741.1869520-3-namhyung@kernel.org

perf lock contention: Use per-cpu array map for spinlocks
Currently lock contention timestamp is maintained in a hash map keyed by pid. That means it needs to get and release a map element (which is proctected by spinlock!) on each contention begin and end pair. This can impact on performance if there are a lot of contention (usually from spinlocks). It used to go with task local storage but it had an issue on memory allocation in some critical paths. Although it's addressed in recent kernels IIUC, the tool should support old kernels too. So it cannot simply switch to the task local storage at least for now. As spinlocks create lots of contention and they disabled preemption during the spinning, it can use per-cpu array to keep the timestamp to avoid overhead in hashmap update and delete. In contention_begin, it's easy to check the lock types since it can see the flags. But contention_end cannot see it. So let's try to per-cpu array first (unconditionally) if it has an active element (lock != 0). Then it should be used and per-task tstamp map should not be used until the per-cpu array element is cleared which means nested spinlock contention (if any) was finished and it nows see (the outer) lock. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Ian Rogers <irogers@google.com> Cc: Hao Luo <haoluo@google.com> Cc: Song Liu <song@kernel.org> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20231020204741.1869520-3-namhyung@kernel.org
b5711042 · Namhyung Kim · 6a070573 · b5711042
Commit b5711042 authored Oct 20, 2023 by Namhyung Kim
Hide whitespace changes
Inline Side-by-side

Showing with 72 additions and 17 deletions

tools/perf/util/bpf_skel/lock_contention.bpf.c tools/perf/util/bpf_skel/lock_contention.bpf.c +72 -17

No files found.
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -42,6 +42,14 @@ struct {
 	__uint(max_entries, MAX_ENTRIES);
 } tstamp SEC(".maps");

+/* maintain per-CPU timestamp at the beginning of contention */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct tstamp_data));
+	__uint(max_entries, 1);
+} tstamp_cpu SEC(".maps");
+
 /* actual lock contention statistics */
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
@@ -311,34 +319,57 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
 	return 0;
 }

-SEC("tp_btf/contention_begin")
-int contention_begin(u64 *ctx)
+static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
 {
 	__u32 pid;
 	struct tstamp_data *pelem;

-	if (!enabled || !can_record(ctx))
-		return 0;
+	/* Use per-cpu array map for spinlock and rwlock */
+	if (flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN ||
+	    flags == (LCB_F_SPIN | LCB_F_WRITE)) {
+		__u32 idx = 0;
+
+		pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
+		/* Do not update the element for nested locks */
+		if (pelem && pelem->lock)
+			pelem = NULL;
+		return pelem;
+	}

 	pid = bpf_get_current_pid_tgid();
 	pelem = bpf_map_lookup_elem(&tstamp, &pid);
+	/* Do not update the element for nested locks */
 	if (pelem && pelem->lock)
-		return 0;
+		return NULL;

 	if (pelem == NULL) {
 		struct tstamp_data zero = {};

 		if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
 			__sync_fetch_and_add(&task_fail, 1);
-			return 0;
+			return NULL;
 		}

 		pelem = bpf_map_lookup_elem(&tstamp, &pid);
 		if (pelem == NULL) {
 			__sync_fetch_and_add(&task_fail, 1);
-			return 0;
+			return NULL;
 		}
 	}
+	return pelem;
+}
+
+SEC("tp_btf/contention_begin")
+int contention_begin(u64 *ctx)
+{
+	struct tstamp_data *pelem;
+
+	if (!enabled || !can_record(ctx))
+		return 0;
+
+	pelem = get_tstamp_elem(ctx[1]);
+	if (pelem == NULL)
+		return 0;

 	pelem->timestamp = bpf_ktime_get_ns();
 	pelem->lock = (__u64)ctx[0];
@@ -377,24 +408,42 @@ int contention_begin(u64 *ctx)
 SEC("tp_btf/contention_end")
 int contention_end(u64 *ctx)
 {
-	__u32 pid;
+	__u32 pid = 0, idx = 0;
 	struct tstamp_data *pelem;
 	struct contention_key key = {};
 	struct contention_data *data;
 	__u64 duration;
+	bool need_delete = false;

 	if (!enabled)
 		return 0;

-	pid = bpf_get_current_pid_tgid();
-	pelem = bpf_map_lookup_elem(&tstamp, &pid);
-	if (!pelem || pelem->lock != ctx[0])
-		return 0;
+	/*
+	 * For spinlock and rwlock, it needs to get the timestamp for the
+	 * per-cpu map.  However, contention_end does not have the flags
+	 * so it cannot know whether it reads percpu or hash map.
+	 *
+	 * Try per-cpu map first and check if there's active contention.
+	 * If it is, do not read hash map because it cannot go to sleeping
+	 * locks before releasing the spinning locks.
+	 */
+	pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
+	if (pelem && pelem->lock) {
+		if (pelem->lock != ctx[0])
+			return 0;
+	} else {
+		pid = bpf_get_current_pid_tgid();
+		pelem = bpf_map_lookup_elem(&tstamp, &pid);
+		if (!pelem || pelem->lock != ctx[0])
+			return 0;
+		need_delete = true;
+	}

 	duration = bpf_ktime_get_ns() - pelem->timestamp;
 	if ((__s64)duration < 0) {
 		pelem->lock = 0;
-		bpf_map_delete_elem(&tstamp, &pid);
+		if (need_delete)
+			bpf_map_delete_elem(&tstamp, &pid);
 		__sync_fetch_and_add(&time_fail, 1);
 		return 0;
 	}
@@ -406,8 +455,11 @@ int contention_end(u64 *ctx)
 	case LOCK_AGGR_TASK:
 		if (lock_owner)
 			key.pid = pelem->flags;
-		else
+		else {
+			if (!need_delete)
+				pid = bpf_get_current_pid_tgid();
 			key.pid = pid;
+		}
 		if (needs_callstack)
 			key.stack_id = pelem->stack_id;
 		break;
@@ -428,7 +480,8 @@ int contention_end(u64 *ctx)
 	if (!data) {
 		if (data_map_full) {
 			pelem->lock = 0;
-			bpf_map_delete_elem(&tstamp, &pid);
+			if (need_delete)
+				bpf_map_delete_elem(&tstamp, &pid);
 			__sync_fetch_and_add(&data_fail, 1);
 			return 0;
 		}
@@ -452,7 +505,8 @@ int contention_end(u64 *ctx)
 			__sync_fetch_and_add(&data_fail, 1);
 		}
 		pelem->lock = 0;
-		bpf_map_delete_elem(&tstamp, &pid);
+		if (need_delete)
+			bpf_map_delete_elem(&tstamp, &pid);
 		return 0;
 	}

@@ -466,7 +520,8 @@ int contention_end(u64 *ctx)
 		data->min_time = duration;

 	pelem->lock = 0;
-	bpf_map_delete_elem(&tstamp, &pid);
+	if (need_delete)
+		bpf_map_delete_elem(&tstamp, &pid);
 	return 0;
 }