Commit 441875b2 authored by John Levon's avatar John Levon Committed by David S. Miller

[PATCH] Fix oprofile on UP, small additional fix

The below has been in -mm for a while, and has been tested on my UP
and 2-way machines.

OProfile was completely unsafe on UP - a spinlock is no protection
against the NMI arriving and putting data into the buffer. Pretty stupid
bug. This fixes it by implementing reader/writer windows in the buffer
and removing the lock altogether. This patch was originally done by Will
Cohen.

It also fixes the oops Dave Hansen saw on 2.5.62 SMP
parent 8f2df25d
...@@ -58,7 +58,7 @@ static void nmi_save_registers(struct op_msrs * msrs) ...@@ -58,7 +58,7 @@ static void nmi_save_registers(struct op_msrs * msrs)
unsigned int const nr_ctrls = model->num_controls; unsigned int const nr_ctrls = model->num_controls;
struct op_msr_group * counters = &msrs->counters; struct op_msr_group * counters = &msrs->counters;
struct op_msr_group * controls = &msrs->controls; struct op_msr_group * controls = &msrs->controls;
int i; unsigned int i;
for (i = 0; i < nr_ctrs; ++i) { for (i = 0; i < nr_ctrs; ++i) {
rdmsr(counters->addrs[i], rdmsr(counters->addrs[i],
...@@ -108,7 +108,7 @@ static void nmi_restore_registers(struct op_msrs * msrs) ...@@ -108,7 +108,7 @@ static void nmi_restore_registers(struct op_msrs * msrs)
unsigned int const nr_ctrls = model->num_controls; unsigned int const nr_ctrls = model->num_controls;
struct op_msr_group * counters = &msrs->counters; struct op_msr_group * counters = &msrs->counters;
struct op_msr_group * controls = &msrs->controls; struct op_msr_group * controls = &msrs->controls;
int i; unsigned int i;
for (i = 0; i < nr_ctrls; ++i) { for (i = 0; i < nr_ctrls; ++i) {
wrmsr(controls->addrs[i], wrmsr(controls->addrs[i],
...@@ -182,7 +182,7 @@ struct op_counter_config counter_config[OP_MAX_COUNTER]; ...@@ -182,7 +182,7 @@ struct op_counter_config counter_config[OP_MAX_COUNTER];
static int nmi_create_files(struct super_block * sb, struct dentry * root) static int nmi_create_files(struct super_block * sb, struct dentry * root)
{ {
int i; unsigned int i;
for (i = 0; i < model->num_counters; ++i) { for (i = 0; i < model->num_counters; ++i) {
struct dentry * dir; struct dentry * dir;
......
...@@ -389,7 +389,7 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT]; ...@@ -389,7 +389,7 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
static void p4_fill_in_addresses(struct op_msrs * const msrs) static void p4_fill_in_addresses(struct op_msrs * const msrs)
{ {
int i; unsigned int i;
unsigned int addr, stag; unsigned int addr, stag;
setup_num_counters(); setup_num_counters();
......
...@@ -277,10 +277,7 @@ static void release_mm(struct mm_struct * mm) ...@@ -277,10 +277,7 @@ static void release_mm(struct mm_struct * mm)
*/ */
static struct mm_struct * take_task_mm(struct task_struct * task) static struct mm_struct * take_task_mm(struct task_struct * task)
{ {
struct mm_struct * mm; struct mm_struct * mm = task->mm;
task_lock(task);
mm = task->mm;
task_unlock(task);
/* if task->mm !NULL, mm_count must be at least 1. It cannot /* if task->mm !NULL, mm_count must be at least 1. It cannot
* drop to 0 without the task exiting, which will have to sleep * drop to 0 without the task exiting, which will have to sleep
...@@ -310,6 +307,32 @@ static inline int is_ctx_switch(unsigned long val) ...@@ -310,6 +307,32 @@ static inline int is_ctx_switch(unsigned long val)
} }
/* compute number of filled slots in cpu_buffer queue */
static unsigned long nr_filled_slots(struct oprofile_cpu_buffer * b)
{
	unsigned long const head = b->head_pos;
	unsigned long const tail = b->tail_pos;

	/* Distance from tail to head, accounting for wrap-around.
	 * head and tail are always < buffer_size. */
	if (head < tail)
		return head + (b->buffer_size - tail);
	return head - tail;
}
/* Advance the reader position by one slot, wrapping at the end
 * of the buffer. Only the reader (sync_buffer) modifies tail_pos. */
static void increment_tail(struct oprofile_cpu_buffer * b)
{
	unsigned long const next = b->tail_pos + 1;

	/* make sure the slot contents have been read before the
	 * consumer position is advanced past them */
	rmb();

	b->tail_pos = (next < b->buffer_size) ? next : 0;
}
/* Sync one of the CPU's buffers into the global event buffer. /* Sync one of the CPU's buffers into the global event buffer.
* Here we need to go through each batch of samples punctuated * Here we need to go through each batch of samples punctuated
* by context switch notes, taking the task's mmap_sem and doing * by context switch notes, taking the task's mmap_sem and doing
...@@ -322,10 +345,14 @@ static void sync_buffer(struct oprofile_cpu_buffer * cpu_buf) ...@@ -322,10 +345,14 @@ static void sync_buffer(struct oprofile_cpu_buffer * cpu_buf)
struct task_struct * new; struct task_struct * new;
unsigned long cookie; unsigned long cookie;
int in_kernel = 1; int in_kernel = 1;
int i; unsigned int i;
for (i=0; i < cpu_buf->pos; ++i) { /* Remember, only we can modify tail_pos */
struct op_sample * s = &cpu_buf->buffer[i];
unsigned long const available_elements = nr_filled_slots(cpu_buf);
for (i=0; i < available_elements; ++i) {
struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
if (is_ctx_switch(s->eip)) { if (is_ctx_switch(s->eip)) {
if (s->event <= 1) { if (s->event <= 1) {
...@@ -345,6 +372,8 @@ static void sync_buffer(struct oprofile_cpu_buffer * cpu_buf) ...@@ -345,6 +372,8 @@ static void sync_buffer(struct oprofile_cpu_buffer * cpu_buf)
} else { } else {
add_sample(mm, s, in_kernel); add_sample(mm, s, in_kernel);
} }
increment_tail(cpu_buf);
} }
release_mm(mm); release_mm(mm);
...@@ -369,17 +398,8 @@ static void sync_cpu_buffers(void) ...@@ -369,17 +398,8 @@ static void sync_cpu_buffers(void)
cpu_buf = &cpu_buffer[i]; cpu_buf = &cpu_buffer[i];
/* We take a spin lock even though we might
* sleep. It's OK because other users are try
* lockers only, and this region is already
* protected by buffer_sem. It's raw to prevent
* the preempt bogometer firing. Fruity, huh ? */
if (cpu_buf->pos > 0) {
_raw_spin_lock(&cpu_buf->int_lock);
add_cpu_switch(i); add_cpu_switch(i);
sync_buffer(cpu_buf); sync_buffer(cpu_buf);
_raw_spin_unlock(&cpu_buf->int_lock);
}
} }
up(&buffer_sem); up(&buffer_sem);
......
...@@ -26,8 +26,6 @@ ...@@ -26,8 +26,6 @@
struct oprofile_cpu_buffer cpu_buffer[NR_CPUS] __cacheline_aligned; struct oprofile_cpu_buffer cpu_buffer[NR_CPUS] __cacheline_aligned;
static unsigned long buffer_size;
static void __free_cpu_buffers(int num) static void __free_cpu_buffers(int num)
{ {
int i; int i;
...@@ -47,7 +45,7 @@ int alloc_cpu_buffers(void) ...@@ -47,7 +45,7 @@ int alloc_cpu_buffers(void)
{ {
int i; int i;
buffer_size = fs_cpu_buffer_size; unsigned long buffer_size = fs_cpu_buffer_size;
for (i=0; i < NR_CPUS; ++i) { for (i=0; i < NR_CPUS; ++i) {
struct oprofile_cpu_buffer * b = &cpu_buffer[i]; struct oprofile_cpu_buffer * b = &cpu_buffer[i];
...@@ -59,12 +57,12 @@ int alloc_cpu_buffers(void) ...@@ -59,12 +57,12 @@ int alloc_cpu_buffers(void)
if (!b->buffer) if (!b->buffer)
goto fail; goto fail;
spin_lock_init(&b->int_lock);
b->pos = 0;
b->last_task = 0; b->last_task = 0;
b->last_is_kernel = -1; b->last_is_kernel = -1;
b->buffer_size = buffer_size;
b->tail_pos = 0;
b->head_pos = 0;
b->sample_received = 0; b->sample_received = 0;
b->sample_lost_locked = 0;
b->sample_lost_overflow = 0; b->sample_lost_overflow = 0;
b->sample_lost_task_exit = 0; b->sample_lost_task_exit = 0;
} }
...@@ -81,10 +79,40 @@ void free_cpu_buffers(void) ...@@ -81,10 +79,40 @@ void free_cpu_buffers(void)
} }
/* Note we can't use a semaphore here as this is supposed to /* compute number of available slots in cpu_buffer queue */
* be safe from any context. Instead we trylock the CPU's int_lock. static unsigned long nr_available_slots(struct oprofile_cpu_buffer const * b)
* int_lock is taken by the processing code in sync_cpu_buffers() {
* so we avoid disturbing that. unsigned long head = b->head_pos;
unsigned long tail = b->tail_pos;
if (tail == head)
return b->buffer_size;
if (tail > head)
return tail - head;
return tail + (b->buffer_size - head);
}
/* Advance the writer position by one slot, wrapping at the end
 * of the buffer. Only the writer (NMI context) modifies head_pos. */
static void increment_head(struct oprofile_cpu_buffer * b)
{
	unsigned long const next = b->head_pos + 1;

	/* Ensure anything written to the slot before we
	 * increment is visible */
	wmb();

	b->head_pos = (next < b->buffer_size) ? next : 0;
}
/* This must be safe from any context. It's safe writing here
* because of the head/tail separation of the writer and reader
* of the CPU buffer.
* *
* is_kernel is needed because on some architectures you cannot * is_kernel is needed because on some architectures you cannot
* tell if you are in kernel or user space simply by looking at * tell if you are in kernel or user space simply by looking at
...@@ -101,14 +129,10 @@ void oprofile_add_sample(unsigned long eip, unsigned int is_kernel, ...@@ -101,14 +129,10 @@ void oprofile_add_sample(unsigned long eip, unsigned int is_kernel,
cpu_buf->sample_received++; cpu_buf->sample_received++;
if (!spin_trylock(&cpu_buf->int_lock)) {
cpu_buf->sample_lost_locked++;
return;
}
if (cpu_buf->pos > buffer_size - 2) { if (nr_available_slots(cpu_buf) < 3) {
cpu_buf->sample_lost_overflow++; cpu_buf->sample_lost_overflow++;
goto out; return;
} }
task = current; task = current;
...@@ -116,18 +140,18 @@ void oprofile_add_sample(unsigned long eip, unsigned int is_kernel, ...@@ -116,18 +140,18 @@ void oprofile_add_sample(unsigned long eip, unsigned int is_kernel,
/* notice a switch from user->kernel or vice versa */ /* notice a switch from user->kernel or vice versa */
if (cpu_buf->last_is_kernel != is_kernel) { if (cpu_buf->last_is_kernel != is_kernel) {
cpu_buf->last_is_kernel = is_kernel; cpu_buf->last_is_kernel = is_kernel;
cpu_buf->buffer[cpu_buf->pos].eip = ~0UL; cpu_buf->buffer[cpu_buf->head_pos].eip = ~0UL;
cpu_buf->buffer[cpu_buf->pos].event = is_kernel; cpu_buf->buffer[cpu_buf->head_pos].event = is_kernel;
cpu_buf->pos++; increment_head(cpu_buf);
} }
/* notice a task switch */ /* notice a task switch */
if (cpu_buf->last_task != task) { if (cpu_buf->last_task != task) {
cpu_buf->last_task = task; cpu_buf->last_task = task;
if (!(task->flags & PF_EXITING)) { if (!(task->flags & PF_EXITING)) {
cpu_buf->buffer[cpu_buf->pos].eip = ~0UL; cpu_buf->buffer[cpu_buf->head_pos].eip = ~0UL;
cpu_buf->buffer[cpu_buf->pos].event = (unsigned long)task; cpu_buf->buffer[cpu_buf->head_pos].event = (unsigned long)task;
cpu_buf->pos++; increment_head(cpu_buf);
} }
} }
...@@ -138,23 +162,20 @@ void oprofile_add_sample(unsigned long eip, unsigned int is_kernel, ...@@ -138,23 +162,20 @@ void oprofile_add_sample(unsigned long eip, unsigned int is_kernel,
*/ */
if (task->flags & PF_EXITING) { if (task->flags & PF_EXITING) {
cpu_buf->sample_lost_task_exit++; cpu_buf->sample_lost_task_exit++;
goto out; return;
} }
cpu_buf->buffer[cpu_buf->pos].eip = eip; cpu_buf->buffer[cpu_buf->head_pos].eip = eip;
cpu_buf->buffer[cpu_buf->pos].event = event; cpu_buf->buffer[cpu_buf->head_pos].event = event;
cpu_buf->pos++; increment_head(cpu_buf);
out:
spin_unlock(&cpu_buf->int_lock);
} }
/* resets the cpu buffer to a sane state - should be called with /* resets the cpu buffer to a sane state - should be called with
* cpu_buf->int_lock held * cpu_buf->int_lock held
*/ */
void cpu_buffer_reset(struct oprofile_cpu_buffer *cpu_buf) void cpu_buffer_reset(struct oprofile_cpu_buffer *cpu_buf)
{ {
cpu_buf->pos = 0;
/* reset these to invalid values; the next sample /* reset these to invalid values; the next sample
* collected will populate the buffer with proper * collected will populate the buffer with proper
* values to initialize the buffer * values to initialize the buffer
...@@ -162,4 +183,3 @@ void cpu_buffer_reset(struct oprofile_cpu_buffer *cpu_buf) ...@@ -162,4 +183,3 @@ void cpu_buffer_reset(struct oprofile_cpu_buffer *cpu_buf)
cpu_buf->last_is_kernel = -1; cpu_buf->last_is_kernel = -1;
cpu_buf->last_task = 0; cpu_buf->last_task = 0;
} }
...@@ -30,14 +30,13 @@ struct op_sample { ...@@ -30,14 +30,13 @@ struct op_sample {
}; };
struct oprofile_cpu_buffer { struct oprofile_cpu_buffer {
spinlock_t int_lock; volatile unsigned long head_pos;
/* protected by int_lock */ volatile unsigned long tail_pos;
unsigned long pos; unsigned long buffer_size;
struct task_struct * last_task; struct task_struct * last_task;
int last_is_kernel; int last_is_kernel;
struct op_sample * buffer; struct op_sample * buffer;
unsigned long sample_received; unsigned long sample_received;
unsigned long sample_lost_locked;
unsigned long sample_lost_overflow; unsigned long sample_lost_overflow;
unsigned long sample_lost_task_exit; unsigned long sample_lost_task_exit;
} ____cacheline_aligned; } ____cacheline_aligned;
......
...@@ -27,7 +27,6 @@ void oprofile_reset_stats(void) ...@@ -27,7 +27,6 @@ void oprofile_reset_stats(void)
cpu_buf = &cpu_buffer[i]; cpu_buf = &cpu_buffer[i];
cpu_buf->sample_received = 0; cpu_buf->sample_received = 0;
cpu_buf->sample_lost_locked = 0;
cpu_buf->sample_lost_overflow = 0; cpu_buf->sample_lost_overflow = 0;
cpu_buf->sample_lost_task_exit = 0; cpu_buf->sample_lost_task_exit = 0;
} }
...@@ -63,8 +62,6 @@ void oprofile_create_stats_files(struct super_block * sb, struct dentry * root) ...@@ -63,8 +62,6 @@ void oprofile_create_stats_files(struct super_block * sb, struct dentry * root)
*/ */
oprofilefs_create_ro_ulong(sb, cpudir, "sample_received", oprofilefs_create_ro_ulong(sb, cpudir, "sample_received",
&cpu_buf->sample_received); &cpu_buf->sample_received);
oprofilefs_create_ro_ulong(sb, cpudir, "sample_lost_locked",
&cpu_buf->sample_lost_locked);
oprofilefs_create_ro_ulong(sb, cpudir, "sample_lost_overflow", oprofilefs_create_ro_ulong(sb, cpudir, "sample_lost_overflow",
&cpu_buf->sample_lost_overflow); &cpu_buf->sample_lost_overflow);
oprofilefs_create_ro_ulong(sb, cpudir, "sample_lost_task_exit", oprofilefs_create_ro_ulong(sb, cpudir, "sample_lost_task_exit",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment