/*
 * deadlock_detector.c  Detects potential deadlocks in a running process.
 *                      For Linux, uses BCC, eBPF. See .py file.
 *
 * Copyright 2017 Facebook, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License")
 *
 * 1-Feb-2016   Kenny Yu   Created this.
 */
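
// Usage sketch (illustrative; not part of this file): the companion .py
// front end presumably loads this program with BCC and attaches the probes
// roughly as follows. The exact attachment points are assumptions, and
// `pid` stands for the target process's ID.
//
//   bpf = BPF(src_file="deadlock_detector.c")
//   bpf.attach_uprobe(name="pthread", sym="pthread_mutex_lock",
//                     fn_name="trace_mutex_acquire", pid=pid)
//   bpf.attach_uprobe(name="pthread", sym="pthread_mutex_unlock",
//                     fn_name="trace_mutex_release", pid=pid)
//   bpf.attach_kretprobe(event=bpf.get_syscall_fnname("clone"),
//                        fn_name="trace_clone")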

#include <linux/sched.h>
#include <uapi/linux/ptrace.h>

// Maximum number of mutexes a single thread can hold at once.
// If this number is too large, the unrolled loops will make the stack
// too big, and the bpf verifier will fail.
#define MAX_HELD_MUTEXES 16

// Info about held mutexes. `mutex` will be 0 if not held.
struct held_mutex_t {
  u64 mutex;
  u64 stack_id;
};

// List of mutexes that a thread is holding. Whenever we loop over this array,
// we need to force the compiler to unroll the loop, otherwise the bpf verifier
// will fail because the loop will create a backwards edge.
struct thread_to_held_mutex_leaf_t {
  struct held_mutex_t held_mutexes[MAX_HELD_MUTEXES];
};

// Map of thread ID -> array of (mutex address, stack id) pairs
BPF_HASH(thread_to_held_mutexes, u32, struct thread_to_held_mutex_leaf_t, 2097152);

// Key type for edges. Represents an edge from mutex1 => mutex2.
struct edges_key_t {
  u64 mutex1;
  u64 mutex2;
};

// Leaf type for edges. Holds information about where each mutex was acquired.
struct edges_leaf_t {
  u64 mutex1_stack_id;
  u64 mutex2_stack_id;
  u32 thread_pid;
  char comm[TASK_COMM_LEN];
};

// Represents all edges currently in the mutex wait graph.
BPF_HASH(edges, struct edges_key_t, struct edges_leaf_t, 2097152);
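
// A deadlock appears as a cycle in this wait graph. The BPF side only
// records edges; the actual cycle detection happens in user space (the .py
// file), which reads this map and searches the directed graph it describes.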

// Info about parent thread when a child thread is created.
struct thread_created_leaf_t {
  u64 stack_id;
  u32 parent_pid;
  char comm[TASK_COMM_LEN];
};

// Map of child thread pid -> info about parent thread.
BPF_HASH(thread_to_parent, u32, struct thread_created_leaf_t);
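
// Following these parent links transitively lets user space reconstruct the
// chain of thread creations, so a report can show where each thread involved
// in a potential deadlock was spawned.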

// Stack traces when threads are created and when mutexes are locked/unlocked.
BPF_STACK_TRACE(stack_traces, 655360);

// The first argument to the user space function we are tracing
// is a pointer to the mutex M held by thread T.
//
// For all mutexes N held by mutexes_held[T]
//   add edge N => M (held by T)
// mutexes_held[T].add(M)
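//
// For example, if a thread locks A, then B, then C while still holding the
// earlier ones, the recorded edges are A => B, A => C, and B => C.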
int trace_mutex_acquire(struct pt_regs *ctx, void *mutex_addr) {
  // Higher 32 bits is process ID, Lower 32 bits is thread ID
  u32 pid = bpf_get_current_pid_tgid();
  u64 mutex = (u64)mutex_addr;

  struct thread_to_held_mutex_leaf_t empty_leaf = {};
  struct thread_to_held_mutex_leaf_t *leaf =
      thread_to_held_mutexes.lookup_or_init(&pid, &empty_leaf);
  if (!leaf) {
    bpf_trace_printk(
        "could not add thread_to_held_mutex key, thread: %d, mutex: %p\n", pid,
        mutex);
    return 1; // Could not insert, no more memory
  }

  // Recursive mutexes lock the same mutex multiple times. We cannot tell if
  // the mutex is recursive after the mutex is already created. To avoid noisy
  // reports, disallow self edges. Do one pass to check if we are already
  // holding the mutex, and if we are, do nothing.
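  // For example, a thread re-locking a PTHREAD_MUTEX_RECURSIVE mutex M that
  // it already holds would otherwise insert a spurious self edge M => M.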
  #pragma unroll
  for (int i = 0; i < MAX_HELD_MUTEXES; ++i) {
    if (leaf->held_mutexes[i].mutex == mutex) {
      return 1; // Disallow self edges
    }
  }

  u64 stack_id =
      stack_traces.get_stackid(ctx, BPF_F_USER_STACK | BPF_F_REUSE_STACKID);
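  // Note on the flags above: BPF_F_USER_STACK requests the user-space stack,
  // and BPF_F_REUSE_STACKID lets a stack_traces hash collision replace the
  // old entry instead of failing the call.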

  int added_mutex = 0;
  #pragma unroll
  for (int i = 0; i < MAX_HELD_MUTEXES; ++i) {
    // If this is a free slot, see if we can insert.
    if (!leaf->held_mutexes[i].mutex) {
      if (!added_mutex) {
        leaf->held_mutexes[i].mutex = mutex;
        leaf->held_mutexes[i].stack_id = stack_id;
        added_mutex = 1;
      }
      continue; // Nothing to do for a free slot
    }

    // Add edges from held mutex => current mutex
    struct edges_key_t edge_key = {};
    edge_key.mutex1 = leaf->held_mutexes[i].mutex;
    edge_key.mutex2 = mutex;

    struct edges_leaf_t edge_leaf = {};
    edge_leaf.mutex1_stack_id = leaf->held_mutexes[i].stack_id;
    edge_leaf.mutex2_stack_id = stack_id;
    edge_leaf.thread_pid = pid;
    bpf_get_current_comm(&edge_leaf.comm, sizeof(edge_leaf.comm));

    // Returns non-zero on error
    int result = edges.update(&edge_key, &edge_leaf);
    if (result) {
      bpf_trace_printk("could not add edge key %p, %p, error: %d\n",
                       edge_key.mutex1, edge_key.mutex2, result);
      continue; // Could not insert, no more memory
    }
  }

  // There were no free slots for this mutex.
  if (!added_mutex) {
    bpf_trace_printk("could not add mutex %p, added_mutex: %d\n", mutex,
                     added_mutex);
    return 1;
  }
  return 0;
}

// The first argument to the user space function we are tracing
// is a pointer to the mutex M held by thread T.
//
// mutexes_held[T].remove(M)
int trace_mutex_release(struct pt_regs *ctx, void *mutex_addr) {
  // Higher 32 bits is process ID, Lower 32 bits is thread ID
  u32 pid = bpf_get_current_pid_tgid();
  u64 mutex = (u64)mutex_addr;

  struct thread_to_held_mutex_leaf_t *leaf =
      thread_to_held_mutexes.lookup(&pid);
  if (!leaf) {
    // If the leaf does not exist for the pid, then it means we either missed
    // the acquire event, or we had no more memory and could not add it.
    bpf_trace_printk(
        "could not find thread_to_held_mutex, thread: %d, mutex: %p\n", pid,
        mutex);
    return 1;
  }

  // For older kernels without "bpf: allow access into map value arrays"
  // (https://lkml.org/lkml/2016/8/30/287) the bpf verifier will fail with an
  // invalid memory access on `leaf->held_mutexes[i]` below. On newer kernels,
  // we can avoid making this extra copy in `value` and use `leaf` directly.
  struct thread_to_held_mutex_leaf_t value = {};
  bpf_probe_read(&value, sizeof(struct thread_to_held_mutex_leaf_t), leaf);

  #pragma unroll
  for (int i = 0; i < MAX_HELD_MUTEXES; ++i) {
    // Find the current mutex (if it exists), and clear it.
    // Note: Can't use `leaf->` in this if condition, see comment above.
    if (value.held_mutexes[i].mutex == mutex) {
      leaf->held_mutexes[i].mutex = 0;
      leaf->held_mutexes[i].stack_id = 0;
    }
  }

  return 0;
}

// Trace return from clone() syscall in the parent thread. There the return
// value is the new child's thread ID; in the child it is 0, and on error it
// is negative, so only positive return values are recorded.
int trace_clone(struct pt_regs *ctx, unsigned long flags, void *child_stack,
                void *ptid, void *ctid, struct pt_regs *regs) {
  long ret = PT_REGS_RC(ctx);
  if (ret <= 0) {
    return 1; // In the child thread, or clone() failed
  }
  u32 child_pid = ret;

  struct thread_created_leaf_t thread_created_leaf = {};
  thread_created_leaf.parent_pid = bpf_get_current_pid_tgid();
  thread_created_leaf.stack_id =
      stack_traces.get_stackid(ctx, BPF_F_USER_STACK | BPF_F_REUSE_STACKID);
  bpf_get_current_comm(&thread_created_leaf.comm,
                       sizeof(thread_created_leaf.comm));

  struct thread_created_leaf_t *insert_result =
      thread_to_parent.lookup_or_init(&child_pid, &thread_created_leaf);
  if (!insert_result) {
    bpf_trace_printk(
        "could not add thread_created_key, child: %d, parent: %d\n", child_pid,
        thread_created_leaf.parent_pid);
    return 1; // Could not insert, no more memory
  }
  return 0;
}