Commit 96200591 authored by Ingo Molnar's avatar Ingo Molnar

Merge branch 'tracing/hw-breakpoints' into perf/core

Conflicts:
	arch/x86/kernel/kprobes.c
	kernel/trace/Makefile

Merge reason: hw-breakpoints perf integration is looking
              good in testing and in reviews, plus conflicts
              are mounting up - so merge & resolve.
Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
parents 7031281e 68efa37d
...@@ -126,4 +126,11 @@ config HAVE_DMA_API_DEBUG ...@@ -126,4 +126,11 @@ config HAVE_DMA_API_DEBUG
config HAVE_DEFAULT_NO_SPIN_MUTEXES config HAVE_DEFAULT_NO_SPIN_MUTEXES
bool bool
config HAVE_HW_BREAKPOINT
bool
depends on HAVE_PERF_EVENTS
select ANON_INODES
select PERF_EVENTS
source "kernel/gcov/Kconfig" source "kernel/gcov/Kconfig"
...@@ -49,6 +49,7 @@ config X86 ...@@ -49,6 +49,7 @@ config X86
select HAVE_KERNEL_GZIP select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZMA
select HAVE_HW_BREAKPOINT
select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_KMEMCHECK
config OUTPUT_FORMAT config OUTPUT_FORMAT
......
...@@ -10,6 +10,7 @@ header-y += ptrace-abi.h ...@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
header-y += sigcontext32.h header-y += sigcontext32.h
header-y += ucontext.h header-y += ucontext.h
header-y += processor-flags.h header-y += processor-flags.h
header-y += hw_breakpoint.h
unifdef-y += e820.h unifdef-y += e820.h
unifdef-y += ist.h unifdef-y += ist.h
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <linux/user.h> #include <linux/user.h>
#include <linux/elfcore.h> #include <linux/elfcore.h>
#include <asm/debugreg.h>
/* /*
* fill in the user structure for an a.out core dump * fill in the user structure for an a.out core dump
...@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) ...@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
>> PAGE_SHIFT; >> PAGE_SHIFT;
dump->u_dsize -= dump->u_tsize; dump->u_dsize -= dump->u_tsize;
dump->u_ssize = 0; dump->u_ssize = 0;
dump->u_debugreg[0] = current->thread.debugreg0; aout_dump_debugregs(dump);
dump->u_debugreg[1] = current->thread.debugreg1;
dump->u_debugreg[2] = current->thread.debugreg2;
dump->u_debugreg[3] = current->thread.debugreg3;
dump->u_debugreg[4] = 0;
dump->u_debugreg[5] = 0;
dump->u_debugreg[6] = current->thread.debugreg6;
dump->u_debugreg[7] = current->thread.debugreg7;
if (dump->start_stack < TASK_SIZE) if (dump->start_stack < TASK_SIZE)
dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#define DR_TRAP1 (0x2) /* db1 */ #define DR_TRAP1 (0x2) /* db1 */
#define DR_TRAP2 (0x4) /* db2 */ #define DR_TRAP2 (0x4) /* db2 */
#define DR_TRAP3 (0x8) /* db3 */ #define DR_TRAP3 (0x8) /* db3 */
#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
#define DR_STEP (0x4000) /* single-step */ #define DR_STEP (0x4000) /* single-step */
#define DR_SWITCH (0x8000) /* task switch */ #define DR_SWITCH (0x8000) /* task switch */
...@@ -49,6 +50,8 @@ ...@@ -49,6 +50,8 @@
#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */
#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */
#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ #define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ #define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
...@@ -67,4 +70,34 @@ ...@@ -67,4 +70,34 @@
#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ #define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ #define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
/*
* HW breakpoint additions
*/
#ifdef __KERNEL__
DECLARE_PER_CPU(unsigned long, dr7);
static inline void hw_breakpoint_disable(void)
{
/* Zero the control register for HW Breakpoint */
set_debugreg(0UL, 7);
/* Zero-out the individual HW breakpoint address registers */
set_debugreg(0UL, 0);
set_debugreg(0UL, 1);
set_debugreg(0UL, 2);
set_debugreg(0UL, 3);
}
static inline int hw_breakpoint_active(void)
{
return __get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK;
}
extern void aout_dump_debugregs(struct user *dump);
extern void hw_breakpoint_restore(void);
#endif /* __KERNEL__ */
#endif /* _ASM_X86_DEBUGREG_H */ #endif /* _ASM_X86_DEBUGREG_H */
#ifndef _I386_HW_BREAKPOINT_H
#define _I386_HW_BREAKPOINT_H
#ifdef __KERNEL__
#define __ARCH_HW_BREAKPOINT_H
/*
* The name should probably be something dealt in
* a higher level. While dealing with the user
* (display/resolving)
*/
struct arch_hw_breakpoint {
char *name; /* Contains name of the symbol to set bkpt */
unsigned long address;
u8 len;
u8 type;
};
#include <linux/kdebug.h>
#include <linux/percpu.h>
#include <linux/list.h>
/* Available HW breakpoint length encodings */
#define X86_BREAKPOINT_LEN_1 0x40
#define X86_BREAKPOINT_LEN_2 0x44
#define X86_BREAKPOINT_LEN_4 0x4c
#define X86_BREAKPOINT_LEN_EXECUTE 0x40
#ifdef CONFIG_X86_64
#define X86_BREAKPOINT_LEN_8 0x48
#endif
/* Available HW breakpoint type encodings */
/* trigger on instruction execute */
#define X86_BREAKPOINT_EXECUTE 0x80
/* trigger on memory write */
#define X86_BREAKPOINT_WRITE 0x81
/* trigger on memory read or write */
#define X86_BREAKPOINT_RW 0x83
/* Total number of available HW breakpoint registers */
#define HBP_NUM 4
struct perf_event;
struct pmu;
extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
struct task_struct *tsk);
extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
unsigned long val, void *data);
int arch_install_hw_breakpoint(struct perf_event *bp);
void arch_uninstall_hw_breakpoint(struct perf_event *bp);
void hw_breakpoint_pmu_read(struct perf_event *bp);
void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
extern void
arch_fill_perf_breakpoint(struct perf_event *bp);
unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
extern int arch_bp_generic_fields(int x86_len, int x86_type,
int *gen_len, int *gen_type);
extern struct pmu perf_ops_bp;
#endif /* __KERNEL__ */
#endif /* _I386_HW_BREAKPOINT_H */
...@@ -30,6 +30,7 @@ struct mm_struct; ...@@ -30,6 +30,7 @@ struct mm_struct;
#include <linux/math64.h> #include <linux/math64.h>
#include <linux/init.h> #include <linux/init.h>
#define HBP_NUM 4
/* /*
* Default implementation of macro that returns current * Default implementation of macro that returns current
* instruction pointer ("program counter"). * instruction pointer ("program counter").
...@@ -422,6 +423,8 @@ extern unsigned int xstate_size; ...@@ -422,6 +423,8 @@ extern unsigned int xstate_size;
extern void free_thread_xstate(struct task_struct *); extern void free_thread_xstate(struct task_struct *);
extern struct kmem_cache *task_xstate_cachep; extern struct kmem_cache *task_xstate_cachep;
struct perf_event;
struct thread_struct { struct thread_struct {
/* Cached TLS descriptors: */ /* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
...@@ -443,13 +446,10 @@ struct thread_struct { ...@@ -443,13 +446,10 @@ struct thread_struct {
unsigned long fs; unsigned long fs;
#endif #endif
unsigned long gs; unsigned long gs;
/* Hardware debugging registers: */ /* Save middle states of ptrace breakpoints */
unsigned long debugreg0; struct perf_event *ptrace_bps[HBP_NUM];
unsigned long debugreg1; /* Debug status used for traps, single steps, etc... */
unsigned long debugreg2;
unsigned long debugreg3;
unsigned long debugreg6; unsigned long debugreg6;
unsigned long debugreg7;
/* Fault info: */ /* Fault info: */
unsigned long cr2; unsigned long cr2;
unsigned long trap_no; unsigned long trap_no;
......
...@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o ...@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
obj-y += bootflag.o e820.o obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o obj-y += tsc.o io_delay.o rtc.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
......
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) 2007 Alan Stern
* Copyright (C) 2009 IBM Corporation
* Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
*/
/*
* HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
* using the CPU's debug registers.
*/
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/irqflags.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <asm/hw_breakpoint.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
/* Per cpu debug control register value */
DEFINE_PER_CPU(unsigned long, dr7);
EXPORT_PER_CPU_SYMBOL(dr7);
/* Per cpu debug address registers values */
static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
/*
* Stores the breakpoints currently in use on each breakpoint address
* register for each cpus
*/
static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
/*
* Encode the length, type, Exact, and Enable bits for a particular breakpoint
* as stored in debug register 7.
*/
unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
{
unsigned long bp_info;
bp_info = (len | type) & 0xf;
bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) |
DR_GLOBAL_SLOWDOWN;
return bp_info;
}
/*
* Decode the length and type bits for a particular breakpoint as
* stored in debug register 7. Return the "enabled" status.
*/
int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
{
int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
*len = (bp_info & 0xc) | 0x40;
*type = (bp_info & 0x3) | 0x80;
return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
}
/*
* Install a perf counter breakpoint.
*
* We seek a free debug address register and use it for this
* breakpoint. Eventually we enable it in the debug control register.
*
* Atomic: we hold the counter->ctx->lock and we only handle variables
* and registers local to this cpu.
*/
int arch_install_hw_breakpoint(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
unsigned long *dr7;
int i;
for (i = 0; i < HBP_NUM; i++) {
struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
if (!*slot) {
*slot = bp;
break;
}
}
if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
return -EBUSY;
set_debugreg(info->address, i);
__get_cpu_var(cpu_debugreg[i]) = info->address;
dr7 = &__get_cpu_var(dr7);
*dr7 |= encode_dr7(i, info->len, info->type);
set_debugreg(*dr7, 7);
return 0;
}
/*
* Uninstall the breakpoint contained in the given counter.
*
* First we search the debug address register it uses and then we disable
* it.
*
* Atomic: we hold the counter->ctx->lock and we only handle variables
* and registers local to this cpu.
*/
void arch_uninstall_hw_breakpoint(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
unsigned long *dr7;
int i;
for (i = 0; i < HBP_NUM; i++) {
struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
if (*slot == bp) {
*slot = NULL;
break;
}
}
if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
return;
dr7 = &__get_cpu_var(dr7);
*dr7 &= ~encode_dr7(i, info->len, info->type);
set_debugreg(*dr7, 7);
}
static int get_hbp_len(u8 hbp_len)
{
unsigned int len_in_bytes = 0;
switch (hbp_len) {
case X86_BREAKPOINT_LEN_1:
len_in_bytes = 1;
break;
case X86_BREAKPOINT_LEN_2:
len_in_bytes = 2;
break;
case X86_BREAKPOINT_LEN_4:
len_in_bytes = 4;
break;
#ifdef CONFIG_X86_64
case X86_BREAKPOINT_LEN_8:
len_in_bytes = 8;
break;
#endif
}
return len_in_bytes;
}
/*
* Check for virtual address in user space.
*/
int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
{
unsigned int len;
len = get_hbp_len(hbp_len);
return (va <= TASK_SIZE - len);
}
/*
* Check for virtual address in kernel space.
*/
static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
{
unsigned int len;
len = get_hbp_len(hbp_len);
return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
}
/*
* Store a breakpoint's encoded address, length, and type.
*/
static int arch_store_info(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
/*
* For kernel-addresses, either the address or symbol name can be
* specified.
*/
if (info->name)
info->address = (unsigned long)
kallsyms_lookup_name(info->name);
if (info->address)
return 0;
return -EINVAL;
}
int arch_bp_generic_fields(int x86_len, int x86_type,
int *gen_len, int *gen_type)
{
/* Len */
switch (x86_len) {
case X86_BREAKPOINT_LEN_1:
*gen_len = HW_BREAKPOINT_LEN_1;
break;
case X86_BREAKPOINT_LEN_2:
*gen_len = HW_BREAKPOINT_LEN_2;
break;
case X86_BREAKPOINT_LEN_4:
*gen_len = HW_BREAKPOINT_LEN_4;
break;
#ifdef CONFIG_X86_64
case X86_BREAKPOINT_LEN_8:
*gen_len = HW_BREAKPOINT_LEN_8;
break;
#endif
default:
return -EINVAL;
}
/* Type */
switch (x86_type) {
case X86_BREAKPOINT_EXECUTE:
*gen_type = HW_BREAKPOINT_X;
break;
case X86_BREAKPOINT_WRITE:
*gen_type = HW_BREAKPOINT_W;
break;
case X86_BREAKPOINT_RW:
*gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
break;
default:
return -EINVAL;
}
return 0;
}
static int arch_build_bp_info(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
info->address = bp->attr.bp_addr;
/* Len */
switch (bp->attr.bp_len) {
case HW_BREAKPOINT_LEN_1:
info->len = X86_BREAKPOINT_LEN_1;
break;
case HW_BREAKPOINT_LEN_2:
info->len = X86_BREAKPOINT_LEN_2;
break;
case HW_BREAKPOINT_LEN_4:
info->len = X86_BREAKPOINT_LEN_4;
break;
#ifdef CONFIG_X86_64
case HW_BREAKPOINT_LEN_8:
info->len = X86_BREAKPOINT_LEN_8;
break;
#endif
default:
return -EINVAL;
}
/* Type */
switch (bp->attr.bp_type) {
case HW_BREAKPOINT_W:
info->type = X86_BREAKPOINT_WRITE;
break;
case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
info->type = X86_BREAKPOINT_RW;
break;
case HW_BREAKPOINT_X:
info->type = X86_BREAKPOINT_EXECUTE;
break;
default:
return -EINVAL;
}
return 0;
}
/*
* Validate the arch-specific HW Breakpoint register settings
*/
int arch_validate_hwbkpt_settings(struct perf_event *bp,
struct task_struct *tsk)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
unsigned int align;
int ret;
ret = arch_build_bp_info(bp);
if (ret)
return ret;
ret = -EINVAL;
if (info->type == X86_BREAKPOINT_EXECUTE)
/*
* Ptrace-refactoring code
* For now, we'll allow instruction breakpoint only for user-space
* addresses
*/
if ((!arch_check_va_in_userspace(info->address, info->len)) &&
info->len != X86_BREAKPOINT_EXECUTE)
return ret;
switch (info->len) {
case X86_BREAKPOINT_LEN_1:
align = 0;
break;
case X86_BREAKPOINT_LEN_2:
align = 1;
break;
case X86_BREAKPOINT_LEN_4:
align = 3;
break;
#ifdef CONFIG_X86_64
case X86_BREAKPOINT_LEN_8:
align = 7;
break;
#endif
default:
return ret;
}
if (bp->callback)
ret = arch_store_info(bp);
if (ret < 0)
return ret;
/*
* Check that the low-order bits of the address are appropriate
* for the alignment implied by len.
*/
if (info->address & align)
return -EINVAL;
/* Check that the virtual address is in the proper range */
if (tsk) {
if (!arch_check_va_in_userspace(info->address, info->len))
return -EFAULT;
} else {
if (!arch_check_va_in_kernelspace(info->address, info->len))
return -EFAULT;
}
return 0;
}
/*
* Dump the debug register contents to the user.
* We can't dump our per cpu values because it
* may contain cpu wide breakpoint, something that
* doesn't belong to the current task.
*
* TODO: include non-ptrace user breakpoints (perf)
*/
void aout_dump_debugregs(struct user *dump)
{
int i;
int dr7 = 0;
struct perf_event *bp;
struct arch_hw_breakpoint *info;
struct thread_struct *thread = &current->thread;
for (i = 0; i < HBP_NUM; i++) {
bp = thread->ptrace_bps[i];
if (bp && !bp->attr.disabled) {
dump->u_debugreg[i] = bp->attr.bp_addr;
info = counter_arch_bp(bp);
dr7 |= encode_dr7(i, info->len, info->type);
} else {
dump->u_debugreg[i] = 0;
}
}
dump->u_debugreg[4] = 0;
dump->u_debugreg[5] = 0;
dump->u_debugreg[6] = current->thread.debugreg6;
dump->u_debugreg[7] = dr7;
}
EXPORT_SYMBOL_GPL(aout_dump_debugregs);
/*
* Release the user breakpoints used by ptrace
*/
void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
{
int i;
struct thread_struct *t = &tsk->thread;
for (i = 0; i < HBP_NUM; i++) {
unregister_hw_breakpoint(t->ptrace_bps[i]);
t->ptrace_bps[i] = NULL;
}
}
void hw_breakpoint_restore(void)
{
set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
set_debugreg(current->thread.debugreg6, 6);
set_debugreg(__get_cpu_var(dr7), 7);
}
EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
/*
* Handle debug exception notifications.
*
* Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
*
* NOTIFY_DONE returned if one of the following conditions is true.
* i) When the causative address is from user-space and the exception
* is a valid one, i.e. not triggered as a result of lazy debug register
* switching
* ii) When there are more bits than trap<n> set in DR6 register (such
* as BD, BS or BT) indicating that more than one debug condition is
* met and requires some more action in do_debug().
*
* NOTIFY_STOP returned for all other cases
*
*/
static int __kprobes hw_breakpoint_handler(struct die_args *args)
{
int i, cpu, rc = NOTIFY_STOP;
struct perf_event *bp;
unsigned long dr7, dr6;
unsigned long *dr6_p;
/* The DR6 value is pointed by args->err */
dr6_p = (unsigned long *)ERR_PTR(args->err);
dr6 = *dr6_p;
/* Do an early return if no trap bits are set in DR6 */
if ((dr6 & DR_TRAP_BITS) == 0)
return NOTIFY_DONE;
get_debugreg(dr7, 7);
/* Disable breakpoints during exception handling */
set_debugreg(0UL, 7);
/*
* Assert that local interrupts are disabled
* Reset the DRn bits in the virtualized register value.
* The ptrace trigger routine will add in whatever is needed.
*/
current->thread.debugreg6 &= ~DR_TRAP_BITS;
cpu = get_cpu();
/* Handle all the breakpoints that were triggered */
for (i = 0; i < HBP_NUM; ++i) {
if (likely(!(dr6 & (DR_TRAP0 << i))))
continue;
/*
* The counter may be concurrently released but that can only
* occur from a call_rcu() path. We can then safely fetch
* the breakpoint, use its callback, touch its counter
* while we are in an rcu_read_lock() path.
*/
rcu_read_lock();
bp = per_cpu(bp_per_reg[i], cpu);
if (bp)
rc = NOTIFY_DONE;
/*
* Reset the 'i'th TRAP bit in dr6 to denote completion of
* exception handling
*/
(*dr6_p) &= ~(DR_TRAP0 << i);
/*
* bp can be NULL due to lazy debug register switching
* or due to concurrent perf counter removing.
*/
if (!bp) {
rcu_read_unlock();
break;
}
(bp->callback)(bp, args->regs);
rcu_read_unlock();
}
if (dr6 & (~DR_TRAP_BITS))
rc = NOTIFY_DONE;
set_debugreg(dr7, 7);
put_cpu();
return rc;
}
/*
* Handle debug exception notifications.
*/
int __kprobes hw_breakpoint_exceptions_notify(
struct notifier_block *unused, unsigned long val, void *data)
{
if (val != DIE_DEBUG)
return NOTIFY_DONE;
return hw_breakpoint_handler(data);
}
void hw_breakpoint_pmu_read(struct perf_event *bp)
{
/* TODO */
}
void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
{
/* TODO */
}
...@@ -43,6 +43,7 @@ ...@@ -43,6 +43,7 @@
#include <linux/smp.h> #include <linux/smp.h>
#include <linux/nmi.h> #include <linux/nmi.h>
#include <asm/debugreg.h>
#include <asm/apicdef.h> #include <asm/apicdef.h>
#include <asm/system.h> #include <asm/system.h>
...@@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) ...@@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
"resuming...\n"); "resuming...\n");
kgdb_arch_handle_exception(args->trapnr, args->signr, kgdb_arch_handle_exception(args->trapnr, args->signr,
args->err, "c", "", regs); args->err, "c", "", regs);
/*
* Reset the BS bit in dr6 (pointed by args->err) to
* denote completion of processing
*/
(*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
return NOTIFY_STOP; return NOTIFY_STOP;
} }
......
...@@ -56,6 +56,7 @@ ...@@ -56,6 +56,7 @@
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/alternative.h> #include <asm/alternative.h>
#include <asm/insn.h> #include <asm/insn.h>
#include <asm/debugreg.h>
void jprobe_return_end(void); void jprobe_return_end(void);
...@@ -945,8 +946,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ...@@ -945,8 +946,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
ret = NOTIFY_STOP; ret = NOTIFY_STOP;
break; break;
case DIE_DEBUG: case DIE_DEBUG:
if (post_kprobe_handler(args->regs)) if (post_kprobe_handler(args->regs)) {
/*
* Reset the BS bit in dr6 (pointed by args->err) to
* denote completion of processing
*/
(*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
ret = NOTIFY_STOP; ret = NOTIFY_STOP;
}
break; break;
case DIE_GPF: case DIE_GPF:
/* /*
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include <asm/desc.h> #include <asm/desc.h>
#include <asm/system.h> #include <asm/system.h>
#include <asm/cacheflush.h> #include <asm/cacheflush.h>
#include <asm/debugreg.h>
static void set_idt(void *newidt, __u16 limit) static void set_idt(void *newidt, __u16 limit)
{ {
...@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image) ...@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */ /* Interrupts aren't acceptable while we reboot */
local_irq_disable(); local_irq_disable();
hw_breakpoint_disable();
if (image->preserve_context) { if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC #ifdef CONFIG_X86_IO_APIC
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/mmu_context.h> #include <asm/mmu_context.h>
#include <asm/debugreg.h>
static int init_one_level2_page(struct kimage *image, pgd_t *pgd, static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
unsigned long addr) unsigned long addr)
...@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image) ...@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */ /* Interrupts aren't acceptable while we reboot */
local_irq_disable(); local_irq_disable();
hw_breakpoint_disable();
if (image->preserve_context) { if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC #ifdef CONFIG_X86_IO_APIC
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <linux/clockchips.h> #include <linux/clockchips.h>
#include <linux/random.h> #include <linux/random.h>
#include <trace/events/power.h> #include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/system.h> #include <asm/system.h>
#include <asm/apic.h> #include <asm/apic.h>
#include <asm/syscalls.h> #include <asm/syscalls.h>
...@@ -17,6 +18,7 @@ ...@@ -17,6 +18,7 @@
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/i387.h> #include <asm/i387.h>
#include <asm/ds.h> #include <asm/ds.h>
#include <asm/debugreg.h>
unsigned long idle_halt; unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt); EXPORT_SYMBOL(idle_halt);
...@@ -103,14 +105,7 @@ void flush_thread(void) ...@@ -103,14 +105,7 @@ void flush_thread(void)
} }
#endif #endif
clear_tsk_thread_flag(tsk, TIF_DEBUG); flush_ptrace_hw_breakpoint(tsk);
tsk->thread.debugreg0 = 0;
tsk->thread.debugreg1 = 0;
tsk->thread.debugreg2 = 0;
tsk->thread.debugreg3 = 0;
tsk->thread.debugreg6 = 0;
tsk->thread.debugreg7 = 0;
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
/* /*
* Forget coprocessor state.. * Forget coprocessor state..
...@@ -192,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, ...@@ -192,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
else if (next->debugctlmsr != prev->debugctlmsr) else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr); update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
set_debugreg(next->debugreg0, 0);
set_debugreg(next->debugreg1, 1);
set_debugreg(next->debugreg2, 2);
set_debugreg(next->debugreg3, 3);
/* no 4 and 5 */
set_debugreg(next->debugreg6, 6);
set_debugreg(next->debugreg7, 7);
}
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) { test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */ /* prev and next are different */
......
...@@ -58,6 +58,7 @@ ...@@ -58,6 +58,7 @@
#include <asm/idle.h> #include <asm/idle.h>
#include <asm/syscalls.h> #include <asm/syscalls.h>
#include <asm/ds.h> #include <asm/ds.h>
#include <asm/debugreg.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
...@@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, ...@@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
task_user_gs(p) = get_user_gs(regs); task_user_gs(p) = get_user_gs(regs);
p->thread.io_bitmap_ptr = NULL;
tsk = current; tsk = current;
err = -ENOMEM;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL); IO_BITMAP_BYTES, GFP_KERNEL);
......
...@@ -52,6 +52,7 @@ ...@@ -52,6 +52,7 @@
#include <asm/idle.h> #include <asm/idle.h>
#include <asm/syscalls.h> #include <asm/syscalls.h>
#include <asm/ds.h> #include <asm/ds.h>
#include <asm/debugreg.h>
asmlinkage extern void ret_from_fork(void); asmlinkage extern void ret_from_fork(void);
...@@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, ...@@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.fs = me->thread.fs; p->thread.fs = me->thread.fs;
p->thread.gs = me->thread.gs; p->thread.gs = me->thread.gs;
p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex); savesegment(gs, p->thread.gsindex);
savesegment(fs, p->thread.fsindex); savesegment(fs, p->thread.fsindex);
savesegment(es, p->thread.es); savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds); savesegment(ds, p->thread.ds);
err = -ENOMEM;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) { if (!p->thread.io_bitmap_ptr) {
...@@ -341,6 +346,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, ...@@ -341,6 +346,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
kfree(p->thread.io_bitmap_ptr); kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0; p->thread.io_bitmap_max = 0;
} }
return err; return err;
} }
...@@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) ...@@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/ */
if (preload_fpu) if (preload_fpu)
__math_state_restore(); __math_state_restore();
return prev_p; return prev_p;
} }
......
...@@ -22,6 +22,8 @@ ...@@ -22,6 +22,8 @@
#include <linux/seccomp.h> #include <linux/seccomp.h>
#include <linux/signal.h> #include <linux/signal.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
...@@ -34,6 +36,7 @@ ...@@ -34,6 +36,7 @@
#include <asm/prctl.h> #include <asm/prctl.h>
#include <asm/proto.h> #include <asm/proto.h>
#include <asm/ds.h> #include <asm/ds.h>
#include <asm/hw_breakpoint.h>
#include "tls.h" #include "tls.h"
...@@ -249,11 +252,6 @@ static int set_segment_reg(struct task_struct *task, ...@@ -249,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
return 0; return 0;
} }
static unsigned long debugreg_addr_limit(struct task_struct *task)
{
return TASK_SIZE - 3;
}
#else /* CONFIG_X86_64 */ #else /* CONFIG_X86_64 */
#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
...@@ -378,15 +376,6 @@ static int set_segment_reg(struct task_struct *task, ...@@ -378,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
return 0; return 0;
} }
static unsigned long debugreg_addr_limit(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
if (test_tsk_thread_flag(task, TIF_IA32))
return IA32_PAGE_OFFSET - 3;
#endif
return TASK_SIZE_MAX - 7;
}
#endif /* CONFIG_X86_32 */ #endif /* CONFIG_X86_32 */
static unsigned long get_flags(struct task_struct *task) static unsigned long get_flags(struct task_struct *task)
...@@ -566,98 +555,228 @@ static int genregs_set(struct task_struct *target, ...@@ -566,98 +555,228 @@ static int genregs_set(struct task_struct *target,
return ret; return ret;
} }
static void ptrace_triggered(struct perf_event *bp, void *data)
{
int i;
struct thread_struct *thread = &(current->thread);
/*
* Store in the virtual DR6 register the fact that the breakpoint
* was hit so the thread's debugger will see it.
*/
for (i = 0; i < HBP_NUM; i++) {
if (thread->ptrace_bps[i] == bp)
break;
}
thread->debugreg6 |= (DR_TRAP0 << i);
}
/* /*
* This function is trivial and will be inlined by the compiler. * Walk through every ptrace breakpoints for this thread and
* Having it separates the implementation details of debug * build the dr7 value on top of their attributes.
* registers from the interface details of ptrace. *
*/ */
static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) static unsigned long ptrace_get_dr7(struct perf_event *bp[])
{ {
switch (n) { int i;
case 0: return child->thread.debugreg0; int dr7 = 0;
case 1: return child->thread.debugreg1; struct arch_hw_breakpoint *info;
case 2: return child->thread.debugreg2;
case 3: return child->thread.debugreg3; for (i = 0; i < HBP_NUM; i++) {
case 6: return child->thread.debugreg6; if (bp[i] && !bp[i]->attr.disabled) {
case 7: return child->thread.debugreg7; info = counter_arch_bp(bp[i]);
dr7 |= encode_dr7(i, info->len, info->type);
} }
return 0; }
return dr7;
} }
static int ptrace_set_debugreg(struct task_struct *child, /*
int n, unsigned long data) * Handle ptrace writes to debug register 7.
*/
static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
{ {
int i; struct thread_struct *thread = &(tsk->thread);
unsigned long old_dr7;
int i, orig_ret = 0, rc = 0;
int enabled, second_pass = 0;
unsigned len, type;
int gen_len, gen_type;
struct perf_event *bp;
if (unlikely(n == 4 || n == 5)) data &= ~DR_CONTROL_RESERVED;
return -EIO; old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
restore:
/*
* Loop through all the hardware breakpoints, making the
* appropriate changes to each.
*/
for (i = 0; i < HBP_NUM; i++) {
enabled = decode_dr7(data, i, &len, &type);
bp = thread->ptrace_bps[i];
if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) if (!enabled) {
return -EIO; if (bp) {
/*
* Don't unregister the breakpoints right-away,
* unless all register_user_hw_breakpoint()
* requests have succeeded. This prevents
* any window of opportunity for debug
* register grabbing by other users.
*/
if (!second_pass)
continue;
thread->ptrace_bps[i] = NULL;
unregister_hw_breakpoint(bp);
}
continue;
}
switch (n) { /*
case 0: child->thread.debugreg0 = data; break; * We shoud have at least an inactive breakpoint at this
case 1: child->thread.debugreg1 = data; break; * slot. It means the user is writing dr7 without having
case 2: child->thread.debugreg2 = data; break; * written the address register first
case 3: child->thread.debugreg3 = data; break; */
if (!bp) {
rc = -EINVAL;
break;
}
case 6: rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
if ((data & ~0xffffffffUL) != 0) if (rc)
return -EIO;
child->thread.debugreg6 = data;
break; break;
case 7:
/* /*
* Sanity-check data. Take one half-byte at once with * This is a temporary thing as bp is unregistered/registered
* check = (val >> (16 + 4*i)) & 0xf. It contains the * to simulate modification
* R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
* 2 and 3 are LENi. Given a list of invalid values,
* we do mask |= 1 << invalid_value, so that
* (mask >> check) & 1 is a correct test for invalid
* values.
*
* R/Wi contains the type of the breakpoint /
* watchpoint, LENi contains the length of the watched
* data in the watchpoint case.
*
* The invalid values are:
* - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
* - R/Wi == 0x10 (break on I/O reads or writes), so
* mask |= 0x4444.
* - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
* 0x1110.
*
* Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
*
* See the Intel Manual "System Programming Guide",
* 15.2.4
*
* Note that LENi == 0x10 is defined on x86_64 in long
* mode (i.e. even for 32-bit userspace software, but
* 64-bit kernel), so the x86_64 mask value is 0x5454.
* See the AMD manual no. 24593 (AMD64 System Programming)
*/ */
#ifdef CONFIG_X86_32 bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len,
#define DR7_MASK 0x5f54 gen_type, bp->callback,
#else tsk, true);
#define DR7_MASK 0x5554 thread->ptrace_bps[i] = NULL;
#endif
data &= ~DR_CONTROL_RESERVED; if (!bp) { /* incorrect bp, or we have a bug in bp API */
for (i = 0; i < 4; i++) rc = -EINVAL;
if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) break;
return -EIO; }
child->thread.debugreg7 = data; if (IS_ERR(bp)) {
if (data) rc = PTR_ERR(bp);
set_tsk_thread_flag(child, TIF_DEBUG); bp = NULL;
else
clear_tsk_thread_flag(child, TIF_DEBUG);
break; break;
} }
thread->ptrace_bps[i] = bp;
}
/*
* Make a second pass to free the remaining unused breakpoints
* or to restore the original breakpoints if an error occurred.
*/
if (!second_pass) {
second_pass = 1;
if (rc < 0) {
orig_ret = rc;
data = old_dr7;
}
goto restore;
}
return ((orig_ret < 0) ? orig_ret : rc);
}
/*
* Handle PTRACE_PEEKUSR calls for the debug register area.
*/
static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
{
struct thread_struct *thread = &(tsk->thread);
unsigned long val = 0;
if (n < HBP_NUM) {
struct perf_event *bp;
bp = thread->ptrace_bps[n];
if (!bp)
return 0;
val = bp->hw.info.address;
} else if (n == 6) {
val = thread->debugreg6;
} else if (n == 7) {
val = ptrace_get_dr7(thread->ptrace_bps);
}
return val;
}
static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
unsigned long addr)
{
struct perf_event *bp;
struct thread_struct *t = &tsk->thread;
if (!t->ptrace_bps[nr]) {
/*
* Put stub len and type to register (reserve) an inactive but
* correct bp
*/
bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1,
HW_BREAKPOINT_W,
ptrace_triggered, tsk,
false);
} else {
bp = t->ptrace_bps[nr];
t->ptrace_bps[nr] = NULL;
bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len,
bp->attr.bp_type,
bp->callback,
tsk,
bp->attr.disabled);
}
if (!bp)
return -EIO;
/*
* CHECKME: the previous code returned -EIO if the addr wasn't a
* valid task virtual addr. The new one will return -EINVAL in this
* case.
* -EINVAL may be what we want for in-kernel breakpoints users, but
* -EIO looks better for ptrace, since we refuse a register writing
* for the user. And anyway this is the previous behaviour.
*/
if (IS_ERR(bp))
return PTR_ERR(bp);
t->ptrace_bps[nr] = bp;
return 0; return 0;
} }
/*
* Handle PTRACE_POKEUSR calls for the debug register area.
*/
int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
{
struct thread_struct *thread = &(tsk->thread);
int rc = 0;
/* There are no DR4 or DR5 registers */
if (n == 4 || n == 5)
return -EIO;
if (n == 6) {
thread->debugreg6 = val;
goto ret_path;
}
if (n < HBP_NUM) {
rc = ptrace_set_breakpoint_addr(tsk, n, val);
if (rc)
return rc;
}
/* All that's left is DR7 */
if (n == 7)
rc = ptrace_write_dr7(tsk, val);
ret_path:
return rc;
}
/* /*
* These access the current or another (stopped) task's io permission * These access the current or another (stopped) task's io permission
* bitmap for debugging or core dump. * bitmap for debugging or core dump.
......
...@@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs) ...@@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs)
signr = get_signal_to_deliver(&info, &ka, regs, NULL); signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) { if (signr > 0) {
/*
* Re-enable any watchpoints before delivering the
* signal to user space. The processor register will
* have been cleared if the watchpoint triggered
* inside the kernel.
*/
if (current->thread.debugreg7)
set_debugreg(current->thread.debugreg7, 7);
/* Whee! Actually deliver the signal. */ /* Whee! Actually deliver the signal. */
if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
/* /*
......
...@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) ...@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
{ {
struct task_struct *tsk = current; struct task_struct *tsk = current;
unsigned long condition; unsigned long dr6;
int si_code; int si_code;
get_debugreg(condition, 6); get_debugreg(dr6, 6);
/* Catch kmemcheck conditions first of all! */ /* Catch kmemcheck conditions first of all! */
if (condition & DR_STEP && kmemcheck_trap(regs)) if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
return; return;
/* DR6 may or may not be cleared by the CPU */
set_debugreg(0, 6);
/* /*
* The processor cleared BTF, so don't mark that we need it set. * The processor cleared BTF, so don't mark that we need it set.
*/ */
clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
tsk->thread.debugctlmsr = 0; tsk->thread.debugctlmsr = 0;
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, /* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
SIGTRAP) == NOTIFY_STOP) SIGTRAP) == NOTIFY_STOP)
return; return;
/* It's safe to allow irq's after DR6 has been saved */ /* It's safe to allow irq's after DR6 has been saved */
preempt_conditional_sti(regs); preempt_conditional_sti(regs);
/* Mask out spurious debug traps due to lazy DR7 setting */ if (regs->flags & X86_VM_MASK) {
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { handle_vm86_trap((struct kernel_vm86_regs *) regs,
if (!tsk->thread.debugreg7) error_code, 1);
goto clear_dr7; return;
} }
#ifdef CONFIG_X86_32
if (regs->flags & X86_VM_MASK)
goto debug_vm86;
#endif
/* Save debug status register where ptrace can see it */
tsk->thread.debugreg6 = condition;
/* /*
* Single-stepping through TF: make sure we ignore any events in * Single-stepping through system calls: ignore any exceptions in
* kernel space (but re-enable TF when returning to user mode). * kernel space, but re-enable TF when returning to user mode.
*
* We already checked v86 mode above, so we can check for kernel mode
* by just checking the CPL of CS.
*/ */
if (condition & DR_STEP) { if ((dr6 & DR_STEP) && !user_mode(regs)) {
if (!user_mode(regs)) tsk->thread.debugreg6 &= ~DR_STEP;
goto clear_TF_reenable; set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
} }
si_code = get_si_code(tsk->thread.debugreg6);
si_code = get_si_code(condition); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
/* Ok, finally something we can handle */
send_sigtrap(tsk, regs, error_code, si_code); send_sigtrap(tsk, regs, error_code, si_code);
/*
* Disable additional traps. They'll be re-enabled when
* the signal is delivered.
*/
clear_dr7:
set_debugreg(0, 7);
preempt_conditional_cli(regs); preempt_conditional_cli(regs);
return;
#ifdef CONFIG_X86_32
debug_vm86:
/* reenable preemption: handle_vm86_trap() might sleep */
dec_preempt_count();
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
conditional_cli(regs);
return;
#endif
clear_TF_reenable:
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
preempt_conditional_cli(regs);
return; return;
} }
......
...@@ -42,6 +42,7 @@ ...@@ -42,6 +42,7 @@
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include "trace.h" #include "trace.h"
#include <asm/debugreg.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/msr.h> #include <asm/msr.h>
#include <asm/desc.h> #include <asm/desc.h>
...@@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
trace_kvm_entry(vcpu->vcpu_id); trace_kvm_entry(vcpu->vcpu_id);
kvm_x86_ops->run(vcpu, kvm_run); kvm_x86_ops->run(vcpu, kvm_run);
if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { /*
set_debugreg(current->thread.debugreg0, 0); * If the guest has used debug registers, at least dr7
set_debugreg(current->thread.debugreg1, 1); * will be disabled while returning to the host.
set_debugreg(current->thread.debugreg2, 2); * If we don't have active breakpoints in the host, we don't
set_debugreg(current->thread.debugreg3, 3); * care about the messed up debug address registers. But if
set_debugreg(current->thread.debugreg6, 6); * we have some of them active, restore the old state.
set_debugreg(current->thread.debugreg7, 7); */
} if (hw_breakpoint_active())
hw_breakpoint_restore();
set_bit(KVM_REQ_KICK, &vcpu->requests); set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable(); local_irq_enable();
......
...@@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) ...@@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
struct die_args *arg = args; struct die_args *arg = args;
if (val == DIE_DEBUG && (arg->err & DR_STEP)) if (val == DIE_DEBUG && (arg->err & DR_STEP))
if (post_kmmio_handler(arg->err, arg->regs) == 1) if (post_kmmio_handler(arg->err, arg->regs) == 1) {
/*
* Reset the BS bit in dr6 (pointed by args->err) to
* denote completion of processing
*/
(*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP;
return NOTIFY_STOP; return NOTIFY_STOP;
}
return NOTIFY_DONE; return NOTIFY_DONE;
} }
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <asm/mce.h> #include <asm/mce.h>
#include <asm/xcr.h> #include <asm/xcr.h>
#include <asm/suspend.h> #include <asm/suspend.h>
#include <asm/debugreg.h>
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
static struct saved_context saved_context; static struct saved_context saved_context;
...@@ -142,31 +143,6 @@ static void fix_processor_context(void) ...@@ -142,31 +143,6 @@ static void fix_processor_context(void)
#endif #endif
load_TR_desc(); /* This does ltr */ load_TR_desc(); /* This does ltr */
load_LDT(&current->active_mm->context); /* This does lldt */ load_LDT(&current->active_mm->context); /* This does lldt */
/*
* Now maybe reload the debug registers
*/
if (current->thread.debugreg7) {
#ifdef CONFIG_X86_32
set_debugreg(current->thread.debugreg0, 0);
set_debugreg(current->thread.debugreg1, 1);
set_debugreg(current->thread.debugreg2, 2);
set_debugreg(current->thread.debugreg3, 3);
/* no 4 and 5 */
set_debugreg(current->thread.debugreg6, 6);
set_debugreg(current->thread.debugreg7, 7);
#else
/* CONFIG_X86_64 */
loaddebug(&current->thread, 0);
loaddebug(&current->thread, 1);
loaddebug(&current->thread, 2);
loaddebug(&current->thread, 3);
/* no 4 and 5 */
loaddebug(&current->thread, 6);
loaddebug(&current->thread, 7);
#endif
}
} }
/** /**
......
#ifndef _LINUX_HW_BREAKPOINT_H
#define _LINUX_HW_BREAKPOINT_H
#include <linux/perf_event.h>
enum {
HW_BREAKPOINT_LEN_1 = 1,
HW_BREAKPOINT_LEN_2 = 2,
HW_BREAKPOINT_LEN_4 = 4,
HW_BREAKPOINT_LEN_8 = 8,
};
enum {
HW_BREAKPOINT_R = 1,
HW_BREAKPOINT_W = 2,
HW_BREAKPOINT_X = 4,
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
{
return bp->attr.bp_addr;
}
static inline int hw_breakpoint_type(struct perf_event *bp)
{
return bp->attr.bp_type;
}
static inline int hw_breakpoint_len(struct perf_event *bp)
{
return bp->attr.bp_len;
}
extern struct perf_event *
register_user_hw_breakpoint(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
struct task_struct *tsk,
bool active);
/* FIXME: only change from the attr, and don't unregister */
extern struct perf_event *
modify_user_hw_breakpoint(struct perf_event *bp,
unsigned long addr,
int len,
int type,
perf_callback_t triggered,
struct task_struct *tsk,
bool active);
/*
* Kernel breakpoints are not associated with any particular thread.
*/
extern struct perf_event *
register_wide_hw_breakpoint_cpu(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
int cpu,
bool active);
extern struct perf_event **
register_wide_hw_breakpoint(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
bool active);
extern int register_perf_hw_breakpoint(struct perf_event *bp);
extern int __register_perf_hw_breakpoint(struct perf_event *bp);
extern void unregister_hw_breakpoint(struct perf_event *bp);
extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events);
extern int reserve_bp_slot(struct perf_event *bp);
extern void release_bp_slot(struct perf_event *bp);
extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk);
static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
{
return &bp->hw.info;
}
#else /* !CONFIG_HAVE_HW_BREAKPOINT */
static inline struct perf_event *
register_user_hw_breakpoint(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
struct task_struct *tsk,
bool active) { return NULL; }
static inline struct perf_event *
modify_user_hw_breakpoint(struct perf_event *bp,
unsigned long addr,
int len,
int type,
perf_callback_t triggered,
struct task_struct *tsk,
bool active) { return NULL; }
static inline struct perf_event *
register_wide_hw_breakpoint_cpu(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
int cpu,
bool active) { return NULL; }
static inline struct perf_event **
register_wide_hw_breakpoint(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
bool active) { return NULL; }
static inline int
register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; }
static inline int
__register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; }
static inline void unregister_hw_breakpoint(struct perf_event *bp) { }
static inline void
unregister_wide_hw_breakpoint(struct perf_event **cpu_events) { }
static inline int
reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; }
static inline void release_bp_slot(struct perf_event *bp) { }
static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { }
static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
{
return NULL;
}
#endif /* CONFIG_HAVE_HW_BREAKPOINT */
#endif /* _LINUX_HW_BREAKPOINT_H */
...@@ -18,6 +18,10 @@ ...@@ -18,6 +18,10 @@
#include <linux/ioctl.h> #include <linux/ioctl.h>
#include <asm/byteorder.h> #include <asm/byteorder.h>
#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <asm/hw_breakpoint.h>
#endif
/* /*
* User-space ABI bits: * User-space ABI bits:
*/ */
...@@ -31,6 +35,7 @@ enum perf_type_id { ...@@ -31,6 +35,7 @@ enum perf_type_id {
PERF_TYPE_TRACEPOINT = 2, PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3, PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4, PERF_TYPE_RAW = 4,
PERF_TYPE_BREAKPOINT = 5,
PERF_TYPE_MAX, /* non-ABI */ PERF_TYPE_MAX, /* non-ABI */
}; };
...@@ -209,6 +214,15 @@ struct perf_event_attr { ...@@ -209,6 +214,15 @@ struct perf_event_attr {
__u32 wakeup_events; /* wakeup every n events */ __u32 wakeup_events; /* wakeup every n events */
__u32 wakeup_watermark; /* bytes before wakeup */ __u32 wakeup_watermark; /* bytes before wakeup */
}; };
union {
struct { /* Hardware breakpoint info */
__u64 bp_addr;
__u32 bp_type;
__u32 bp_len;
};
};
__u32 __reserved_2; __u32 __reserved_2;
__u64 __reserved_3; __u64 __reserved_3;
...@@ -478,6 +492,11 @@ struct hw_perf_event { ...@@ -478,6 +492,11 @@ struct hw_perf_event {
s64 remaining; s64 remaining;
struct hrtimer hrtimer; struct hrtimer hrtimer;
}; };
#ifdef CONFIG_HAVE_HW_BREAKPOINT
union { /* breakpoint */
struct arch_hw_breakpoint info;
};
#endif
}; };
atomic64_t prev_count; atomic64_t prev_count;
u64 sample_period; u64 sample_period;
...@@ -546,6 +565,8 @@ struct perf_pending_entry { ...@@ -546,6 +565,8 @@ struct perf_pending_entry {
void (*func)(struct perf_pending_entry *); void (*func)(struct perf_pending_entry *);
}; };
typedef void (*perf_callback_t)(struct perf_event *, void *);
/** /**
* struct perf_event - performance event kernel representation: * struct perf_event - performance event kernel representation:
*/ */
...@@ -641,6 +662,10 @@ struct perf_event { ...@@ -641,6 +662,10 @@ struct perf_event {
struct event_filter *filter; struct event_filter *filter;
#endif #endif
perf_callback_t callback;
perf_callback_t event_callback;
#endif /* CONFIG_PERF_EVENTS */ #endif /* CONFIG_PERF_EVENTS */
}; };
...@@ -745,6 +770,13 @@ extern int hw_perf_group_sched_in(struct perf_event *group_leader, ...@@ -745,6 +770,13 @@ extern int hw_perf_group_sched_in(struct perf_event *group_leader,
struct perf_cpu_context *cpuctx, struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx, int cpu); struct perf_event_context *ctx, int cpu);
extern void perf_event_update_userpage(struct perf_event *event); extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
int cpu,
pid_t pid,
perf_callback_t callback);
extern u64 perf_event_read_value(struct perf_event *event);
struct perf_sample_data { struct perf_sample_data {
u64 type; u64 type;
...@@ -821,6 +853,7 @@ extern int sysctl_perf_event_sample_rate; ...@@ -821,6 +853,7 @@ extern int sysctl_perf_event_sample_rate;
extern void perf_event_init(void); extern void perf_event_init(void);
extern void perf_tp_event(int event_id, u64 addr, u64 count, extern void perf_tp_event(int event_id, u64 addr, u64 count,
void *record, int entry_size); void *record, int entry_size);
extern void perf_bp_event(struct perf_event *event, void *data);
#ifndef perf_misc_flags #ifndef perf_misc_flags
#define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \ #define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \
...@@ -855,6 +888,8 @@ static inline int perf_event_task_enable(void) { return -EINVAL; } ...@@ -855,6 +888,8 @@ static inline int perf_event_task_enable(void) { return -EINVAL; }
static inline void static inline void
perf_sw_event(u32 event_id, u64 nr, int nmi, perf_sw_event(u32 event_id, u64 nr, int nmi,
struct pt_regs *regs, u64 addr) { } struct pt_regs *regs, u64 addr) { }
static inline void
perf_bp_event(struct perf_event *event, void *data) { }
static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { }
static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_comm(struct task_struct *tsk) { }
......
...@@ -95,6 +95,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ ...@@ -95,6 +95,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o obj-$(CONFIG_SLOW_WORK) += slow-work.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
......
...@@ -49,6 +49,7 @@ ...@@ -49,6 +49,7 @@
#include <linux/init_task.h> #include <linux/init_task.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <trace/events/sched.h> #include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/unistd.h> #include <asm/unistd.h>
...@@ -977,6 +978,10 @@ NORET_TYPE void do_exit(long code) ...@@ -977,6 +978,10 @@ NORET_TYPE void do_exit(long code)
proc_exit_connector(tsk); proc_exit_connector(tsk);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
/* /*
* Flush inherited counters to the parent - before the parent * Flush inherited counters to the parent - before the parent
* gets woken up by child-exit notifications. * gets woken up by child-exit notifications.
......
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) 2007 Alan Stern
* Copyright (C) IBM Corporation, 2009
* Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
*
* Thanks to Ingo Molnar for his many suggestions.
*/
/*
* HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
* using the CPU's debug registers.
* This file contains the arch-independent routines.
*/
#include <linux/irqflags.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/hw_breakpoint.h>
#include <asm/processor.h>
#ifdef CONFIG_X86
#include <asm/debugreg.h>
#endif
/*
* Constraints data
*/
/* Number of pinned cpu breakpoints in a cpu */
static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
/* Number of pinned task breakpoints in a cpu */
static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
/* Number of non-pinned cpu/task breakpoints in a cpu */
static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
/* Gather the number of total pinned and un-pinned bp in a cpuset */
struct bp_busy_slots {
unsigned int pinned;
unsigned int flexible;
};
/* Serialize accesses to the above constraints */
static DEFINE_MUTEX(nr_bp_mutex);
/*
* Report the maximum number of pinned breakpoints a task
* have in this cpu
*/
static unsigned int max_task_bp_pinned(int cpu)
{
int i;
unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
for (i = HBP_NUM -1; i >= 0; i--) {
if (tsk_pinned[i] > 0)
return i + 1;
}
return 0;
}
/*
* Report the number of pinned/un-pinned breakpoints we have in
* a given cpu (cpu > -1) or in all of them (cpu = -1).
*/
static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
{
if (cpu >= 0) {
slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
slots->pinned += max_task_bp_pinned(cpu);
slots->flexible = per_cpu(nr_bp_flexible, cpu);
return;
}
for_each_online_cpu(cpu) {
unsigned int nr;
nr = per_cpu(nr_cpu_bp_pinned, cpu);
nr += max_task_bp_pinned(cpu);
if (nr > slots->pinned)
slots->pinned = nr;
nr = per_cpu(nr_bp_flexible, cpu);
if (nr > slots->flexible)
slots->flexible = nr;
}
}
/*
* Add a pinned breakpoint for the given task in our constraint table
*/
static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
{
int count = 0;
struct perf_event *bp;
struct perf_event_context *ctx = tsk->perf_event_ctxp;
unsigned int *task_bp_pinned;
struct list_head *list;
unsigned long flags;
if (WARN_ONCE(!ctx, "No perf context for this task"))
return;
list = &ctx->event_list;
spin_lock_irqsave(&ctx->lock, flags);
/*
* The current breakpoint counter is not included in the list
* at the open() callback time
*/
list_for_each_entry(bp, list, event_entry) {
if (bp->attr.type == PERF_TYPE_BREAKPOINT)
count++;
}
spin_unlock_irqrestore(&ctx->lock, flags);
if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
return;
task_bp_pinned = per_cpu(task_bp_pinned, cpu);
if (enable) {
task_bp_pinned[count]++;
if (count > 0)
task_bp_pinned[count-1]--;
} else {
task_bp_pinned[count]--;
if (count > 0)
task_bp_pinned[count-1]++;
}
}
/*
* Add/remove the given breakpoint in our constraint table
*/
static void toggle_bp_slot(struct perf_event *bp, bool enable)
{
int cpu = bp->cpu;
struct task_struct *tsk = bp->ctx->task;
/* Pinned counter task profiling */
if (tsk) {
if (cpu >= 0) {
toggle_bp_task_slot(tsk, cpu, enable);
return;
}
for_each_online_cpu(cpu)
toggle_bp_task_slot(tsk, cpu, enable);
return;
}
/* Pinned counter cpu profiling */
if (enable)
per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
else
per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
}
/*
* Contraints to check before allowing this new breakpoint counter:
*
* == Non-pinned counter == (Considered as pinned for now)
*
* - If attached to a single cpu, check:
*
* (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
* + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
*
* -> If there are already non-pinned counters in this cpu, it means
* there is already a free slot for them.
* Otherwise, we check that the maximum number of per task
* breakpoints (for this cpu) plus the number of per cpu breakpoint
* (for this cpu) doesn't cover every registers.
*
* - If attached to every cpus, check:
*
* (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
* + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
*
* -> This is roughly the same, except we check the number of per cpu
* bp for every cpu and we keep the max one. Same for the per tasks
* breakpoints.
*
*
* == Pinned counter ==
*
* - If attached to a single cpu, check:
*
* ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
* + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
*
* -> Same checks as before. But now the nr_bp_flexible, if any, must keep
* one register at least (or they will never be fed).
*
* - If attached to every cpus, check:
*
* ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
* + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
*/
int reserve_bp_slot(struct perf_event *bp)
{
struct bp_busy_slots slots = {0};
int ret = 0;
mutex_lock(&nr_bp_mutex);
fetch_bp_busy_slots(&slots, bp->cpu);
/* Flexible counters need to keep at least one slot */
if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
ret = -ENOSPC;
goto end;
}
toggle_bp_slot(bp, true);
end:
mutex_unlock(&nr_bp_mutex);
return ret;
}
void release_bp_slot(struct perf_event *bp)
{
mutex_lock(&nr_bp_mutex);
toggle_bp_slot(bp, false);
mutex_unlock(&nr_bp_mutex);
}
int __register_perf_hw_breakpoint(struct perf_event *bp)
{
int ret;
ret = reserve_bp_slot(bp);
if (ret)
return ret;
if (!bp->attr.disabled)
ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
return ret;
}
int register_perf_hw_breakpoint(struct perf_event *bp)
{
bp->callback = perf_bp_event;
return __register_perf_hw_breakpoint(bp);
}
/*
* Register a breakpoint bound to a task and a given cpu.
* If cpu is -1, the breakpoint is active for the task in every cpu
* If the task is -1, the breakpoint is active for every tasks in the given
* cpu.
*/
static struct perf_event *
register_user_hw_breakpoint_cpu(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
pid_t pid,
int cpu,
bool active)
{
struct perf_event_attr *attr;
struct perf_event *bp;
attr = kzalloc(sizeof(*attr), GFP_KERNEL);
if (!attr)
return ERR_PTR(-ENOMEM);
attr->type = PERF_TYPE_BREAKPOINT;
attr->size = sizeof(*attr);
attr->bp_addr = addr;
attr->bp_len = len;
attr->bp_type = type;
/*
* Such breakpoints are used by debuggers to trigger signals when
* we hit the excepted memory op. We can't miss such events, they
* must be pinned.
*/
attr->pinned = 1;
if (!active)
attr->disabled = 1;
bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered);
kfree(attr);
return bp;
}
/**
* register_user_hw_breakpoint - register a hardware breakpoint for user space
* @addr: is the memory address that triggers the breakpoint
* @len: the length of the access to the memory (1 byte, 2 bytes etc...)
* @type: the type of the access to the memory (read/write/exec)
* @triggered: callback to trigger when we hit the breakpoint
* @tsk: pointer to 'task_struct' of the process to which the address belongs
* @active: should we activate it while registering it
*
*/
struct perf_event *
register_user_hw_breakpoint(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
struct task_struct *tsk,
bool active)
{
return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
tsk->pid, -1, active);
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
/**
* modify_user_hw_breakpoint - modify a user-space hardware breakpoint
* @bp: the breakpoint structure to modify
* @addr: is the memory address that triggers the breakpoint
* @len: the length of the access to the memory (1 byte, 2 bytes etc...)
* @type: the type of the access to the memory (read/write/exec)
* @triggered: callback to trigger when we hit the breakpoint
* @tsk: pointer to 'task_struct' of the process to which the address belongs
* @active: should we activate it while registering it
*/
struct perf_event *
modify_user_hw_breakpoint(struct perf_event *bp,
unsigned long addr,
int len,
int type,
perf_callback_t triggered,
struct task_struct *tsk,
bool active)
{
/*
* FIXME: do it without unregistering
* - We don't want to lose our slot
* - If the new bp is incorrect, don't lose the older one
*/
unregister_hw_breakpoint(bp);
return register_user_hw_breakpoint(addr, len, type, triggered,
tsk, active);
}
EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
/**
* unregister_hw_breakpoint - unregister a user-space hardware breakpoint
* @bp: the breakpoint structure to unregister
*/
void unregister_hw_breakpoint(struct perf_event *bp)
{
if (!bp)
return;
perf_event_release_kernel(bp);
}
EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
static struct perf_event *
register_kernel_hw_breakpoint_cpu(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
int cpu,
bool active)
{
return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
-1, cpu, active);
}
/**
* register_wide_hw_breakpoint - register a wide breakpoint in the kernel
* @addr: is the memory address that triggers the breakpoint
* @len: the length of the access to the memory (1 byte, 2 bytes etc...)
* @type: the type of the access to the memory (read/write/exec)
* @triggered: callback to trigger when we hit the breakpoint
* @active: should we activate it while registering it
*
* @return a set of per_cpu pointers to perf events
*/
struct perf_event **
register_wide_hw_breakpoint(unsigned long addr,
int len,
int type,
perf_callback_t triggered,
bool active)
{
struct perf_event **cpu_events, **pevent, *bp;
long err;
int cpu;
cpu_events = alloc_percpu(typeof(*cpu_events));
if (!cpu_events)
return ERR_PTR(-ENOMEM);
for_each_possible_cpu(cpu) {
pevent = per_cpu_ptr(cpu_events, cpu);
bp = register_kernel_hw_breakpoint_cpu(addr, len, type,
triggered, cpu, active);
*pevent = bp;
if (IS_ERR(bp) || !bp) {
err = PTR_ERR(bp);
goto fail;
}
}
return cpu_events;
fail:
for_each_possible_cpu(cpu) {
pevent = per_cpu_ptr(cpu_events, cpu);
if (IS_ERR(*pevent) || !*pevent)
break;
unregister_hw_breakpoint(*pevent);
}
free_percpu(cpu_events);
/* return the error if any */
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
/**
* unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
* @cpu_events: the per cpu set of events to unregister
*/
void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
{
int cpu;
struct perf_event **pevent;
for_each_possible_cpu(cpu) {
pevent = per_cpu_ptr(cpu_events, cpu);
unregister_hw_breakpoint(*pevent);
}
free_percpu(cpu_events);
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
static struct notifier_block hw_breakpoint_exceptions_nb = {
.notifier_call = hw_breakpoint_exceptions_notify,
/* we need to be notified first */
.priority = 0x7fffffff
};
static int __init init_hw_breakpoint(void)
{
return register_die_notifier(&hw_breakpoint_exceptions_nb);
}
core_initcall(init_hw_breakpoint);
struct pmu perf_ops_bp = {
.enable = arch_install_hw_breakpoint,
.disable = arch_uninstall_hw_breakpoint,
.read = hw_breakpoint_pmu_read,
.unthrottle = hw_breakpoint_pmu_unthrottle
};
...@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name) ...@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
} }
return module_kallsyms_lookup_name(name); return module_kallsyms_lookup_name(name);
} }
EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long), unsigned long),
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include <linux/kernel_stat.h> #include <linux/kernel_stat.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <linux/ftrace_event.h> #include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <asm/irq_regs.h> #include <asm/irq_regs.h>
...@@ -1725,6 +1726,26 @@ static int perf_release(struct inode *inode, struct file *file) ...@@ -1725,6 +1726,26 @@ static int perf_release(struct inode *inode, struct file *file)
return 0; return 0;
} }
int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_event_remove_from_context(event);
mutex_unlock(&ctx->mutex);
mutex_lock(&event->owner->perf_event_mutex);
list_del_init(&event->owner_entry);
mutex_unlock(&event->owner->perf_event_mutex);
put_task_struct(event->owner);
free_event(event);
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
static int perf_event_read_size(struct perf_event *event) static int perf_event_read_size(struct perf_event *event)
{ {
int entry = sizeof(u64); /* value */ int entry = sizeof(u64); /* value */
...@@ -1750,7 +1771,7 @@ static int perf_event_read_size(struct perf_event *event) ...@@ -1750,7 +1771,7 @@ static int perf_event_read_size(struct perf_event *event)
return size; return size;
} }
static u64 perf_event_read_value(struct perf_event *event) u64 perf_event_read_value(struct perf_event *event)
{ {
struct perf_event *child; struct perf_event *child;
u64 total = 0; u64 total = 0;
...@@ -1761,6 +1782,7 @@ static u64 perf_event_read_value(struct perf_event *event) ...@@ -1761,6 +1782,7 @@ static u64 perf_event_read_value(struct perf_event *event)
return total; return total;
} }
EXPORT_SYMBOL_GPL(perf_event_read_value);
static int perf_event_read_entry(struct perf_event *event, static int perf_event_read_entry(struct perf_event *event,
u64 read_format, char __user *buf) u64 read_format, char __user *buf)
...@@ -4231,6 +4253,51 @@ static void perf_event_free_filter(struct perf_event *event) ...@@ -4231,6 +4253,51 @@ static void perf_event_free_filter(struct perf_event *event)
#endif /* CONFIG_EVENT_PROFILE */ #endif /* CONFIG_EVENT_PROFILE */
#ifdef CONFIG_HAVE_HW_BREAKPOINT
static void bp_perf_event_destroy(struct perf_event *event)
{
release_bp_slot(event);
}
static const struct pmu *bp_perf_event_init(struct perf_event *bp)
{
int err;
/*
* The breakpoint is already filled if we haven't created the counter
* through perf syscall
* FIXME: manage to get trigerred to NULL if it comes from syscalls
*/
if (!bp->callback)
err = register_perf_hw_breakpoint(bp);
else
err = __register_perf_hw_breakpoint(bp);
if (err)
return ERR_PTR(err);
bp->destroy = bp_perf_event_destroy;
return &perf_ops_bp;
}
void perf_bp_event(struct perf_event *bp, void *regs)
{
/* TODO */
}
#else
static void bp_perf_event_destroy(struct perf_event *event)
{
}
static const struct pmu *bp_perf_event_init(struct perf_event *bp)
{
return NULL;
}
void perf_bp_event(struct perf_event *bp, void *regs)
{
}
#endif
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
static void sw_perf_event_destroy(struct perf_event *event) static void sw_perf_event_destroy(struct perf_event *event)
...@@ -4297,6 +4364,7 @@ perf_event_alloc(struct perf_event_attr *attr, ...@@ -4297,6 +4364,7 @@ perf_event_alloc(struct perf_event_attr *attr,
struct perf_event_context *ctx, struct perf_event_context *ctx,
struct perf_event *group_leader, struct perf_event *group_leader,
struct perf_event *parent_event, struct perf_event *parent_event,
perf_callback_t callback,
gfp_t gfpflags) gfp_t gfpflags)
{ {
const struct pmu *pmu; const struct pmu *pmu;
...@@ -4339,6 +4407,11 @@ perf_event_alloc(struct perf_event_attr *attr, ...@@ -4339,6 +4407,11 @@ perf_event_alloc(struct perf_event_attr *attr,
event->state = PERF_EVENT_STATE_INACTIVE; event->state = PERF_EVENT_STATE_INACTIVE;
if (!callback && parent_event)
callback = parent_event->callback;
event->callback = callback;
if (attr->disabled) if (attr->disabled)
event->state = PERF_EVENT_STATE_OFF; event->state = PERF_EVENT_STATE_OFF;
...@@ -4373,6 +4446,11 @@ perf_event_alloc(struct perf_event_attr *attr, ...@@ -4373,6 +4446,11 @@ perf_event_alloc(struct perf_event_attr *attr,
pmu = tp_perf_event_init(event); pmu = tp_perf_event_init(event);
break; break;
case PERF_TYPE_BREAKPOINT:
pmu = bp_perf_event_init(event);
break;
default: default:
break; break;
} }
...@@ -4615,7 +4693,7 @@ SYSCALL_DEFINE5(perf_event_open, ...@@ -4615,7 +4693,7 @@ SYSCALL_DEFINE5(perf_event_open,
} }
event = perf_event_alloc(&attr, cpu, ctx, group_leader, event = perf_event_alloc(&attr, cpu, ctx, group_leader,
NULL, GFP_KERNEL); NULL, NULL, GFP_KERNEL);
err = PTR_ERR(event); err = PTR_ERR(event);
if (IS_ERR(event)) if (IS_ERR(event))
goto err_put_context; goto err_put_context;
...@@ -4663,6 +4741,58 @@ SYSCALL_DEFINE5(perf_event_open, ...@@ -4663,6 +4741,58 @@ SYSCALL_DEFINE5(perf_event_open,
return err; return err;
} }
/**
* perf_event_create_kernel_counter
*
* @attr: attributes of the counter to create
* @cpu: cpu in which the counter is bound
* @pid: task to profile
*/
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
pid_t pid, perf_callback_t callback)
{
struct perf_event *event;
struct perf_event_context *ctx;
int err;
/*
* Get the target context (task or percpu):
*/
ctx = find_get_context(pid, cpu);
if (IS_ERR(ctx))
return NULL;
event = perf_event_alloc(attr, cpu, ctx, NULL,
NULL, callback, GFP_KERNEL);
err = PTR_ERR(event);
if (IS_ERR(event))
goto err_put_context;
event->filp = NULL;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
++ctx->generation;
mutex_unlock(&ctx->mutex);
event->owner = current;
get_task_struct(current);
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
return event;
err_put_context:
if (err < 0)
put_ctx(ctx);
return NULL;
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
/* /*
* inherit a event from parent task to child task: * inherit a event from parent task to child task:
*/ */
...@@ -4688,7 +4818,7 @@ inherit_event(struct perf_event *parent_event, ...@@ -4688,7 +4818,7 @@ inherit_event(struct perf_event *parent_event,
child_event = perf_event_alloc(&parent_event->attr, child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu, child_ctx, parent_event->cpu, child_ctx,
group_leader, parent_event, group_leader, parent_event,
GFP_KERNEL); NULL, GFP_KERNEL);
if (IS_ERR(child_event)) if (IS_ERR(child_event))
return child_event; return child_event;
get_ctx(child_ctx); get_ctx(child_ctx);
......
...@@ -339,6 +339,27 @@ config POWER_TRACER ...@@ -339,6 +339,27 @@ config POWER_TRACER
power management decisions, specifically the C-state and P-state power management decisions, specifically the C-state and P-state
behavior. behavior.
config KSYM_TRACER
bool "Trace read and write access on kernel memory locations"
depends on HAVE_HW_BREAKPOINT
select TRACING
help
This tracer helps find read and write operations on any given kernel
symbol i.e. /proc/kallsyms.
config PROFILE_KSYM_TRACER
bool "Profile all kernel memory accesses on 'watched' variables"
depends on KSYM_TRACER
help
This tracer profiles kernel accesses on variables watched through the
ksym tracer ftrace plugin. Depending upon the hardware, all read
and write operations on kernel variables can be monitored for
accesses.
The results will be displayed in:
/debugfs/tracing/profile_ksym
Say N if unsure.
config STACK_TRACER config STACK_TRACER
bool "Trace max stack" bool "Trace max stack"
......
...@@ -54,6 +54,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o ...@@ -54,6 +54,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
obj-$(CONFIG_EVENT_TRACING) += power-traces.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o
libftrace-y := ftrace.o libftrace-y := ftrace.o
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include <linux/ftrace.h> #include <linux/ftrace.h>
#include <trace/boot.h> #include <trace/boot.h>
#include <linux/kmemtrace.h> #include <linux/kmemtrace.h>
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h> #include <linux/trace_seq.h>
#include <linux/ftrace_event.h> #include <linux/ftrace_event.h>
...@@ -37,6 +38,7 @@ enum trace_type { ...@@ -37,6 +38,7 @@ enum trace_type {
TRACE_KMEM_ALLOC, TRACE_KMEM_ALLOC,
TRACE_KMEM_FREE, TRACE_KMEM_FREE,
TRACE_BLK, TRACE_BLK,
TRACE_KSYM,
__TRACE_LAST_TYPE, __TRACE_LAST_TYPE,
}; };
...@@ -232,6 +234,7 @@ extern void __ftrace_bad_type(void); ...@@ -232,6 +234,7 @@ extern void __ftrace_bad_type(void);
TRACE_KMEM_ALLOC); \ TRACE_KMEM_ALLOC); \
IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
TRACE_KMEM_FREE); \ TRACE_KMEM_FREE); \
IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
__ftrace_bad_type(); \ __ftrace_bad_type(); \
} while (0) } while (0)
...@@ -387,6 +390,8 @@ int register_tracer(struct tracer *type); ...@@ -387,6 +390,8 @@ int register_tracer(struct tracer *type);
void unregister_tracer(struct tracer *type); void unregister_tracer(struct tracer *type);
int is_tracing_stopped(void); int is_tracing_stopped(void);
extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long nsecs_to_usecs(unsigned long nsecs);
#ifdef CONFIG_TRACER_MAX_TRACE #ifdef CONFIG_TRACER_MAX_TRACE
...@@ -461,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, ...@@ -461,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr); struct trace_array *tr);
extern int trace_selftest_startup_hw_branches(struct tracer *trace, extern int trace_selftest_startup_hw_branches(struct tracer *trace,
struct trace_array *tr); struct trace_array *tr);
extern int trace_selftest_startup_ksym(struct tracer *trace,
struct trace_array *tr);
#endif /* CONFIG_FTRACE_STARTUP_TEST */ #endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data); extern void *head_page(struct trace_array_cpu *data);
......
...@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, ...@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
F_printk("type:%u call_site:%lx ptr:%p", F_printk("type:%u call_site:%lx ptr:%p",
__entry->type_id, __entry->call_site, __entry->ptr) __entry->type_id, __entry->call_site, __entry->ptr)
); );
FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
TRACE_KSYM,
F_STRUCT(
__field( unsigned long, ip )
__field( unsigned char, type )
__array( char , cmd, TASK_COMM_LEN )
__field( unsigned long, addr )
),
F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
(void *)__entry->ip, (unsigned int)__entry->type,
(void *)__entry->addr, __entry->cmd)
);
/*
* trace_ksym.c - Kernel Symbol Tracer
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2009
*/
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/fs.h>
#include "trace_output.h"
#include "trace_stat.h"
#include "trace.h"
#include <linux/hw_breakpoint.h>
#include <asm/hw_breakpoint.h>
/*
* For now, let us restrict the no. of symbols traced simultaneously to number
* of available hardware breakpoint registers.
*/
#define KSYM_TRACER_MAX HBP_NUM
#define KSYM_TRACER_OP_LEN 3 /* rw- */
struct trace_ksym {
struct perf_event **ksym_hbp;
unsigned long ksym_addr;
int type;
int len;
#ifdef CONFIG_PROFILE_KSYM_TRACER
unsigned long counter;
#endif
struct hlist_node ksym_hlist;
};
static struct trace_array *ksym_trace_array;
static unsigned int ksym_filter_entry_count;
static unsigned int ksym_tracing_enabled;
static HLIST_HEAD(ksym_filter_head);
static DEFINE_MUTEX(ksym_tracer_mutex);
#ifdef CONFIG_PROFILE_KSYM_TRACER
#define MAX_UL_INT 0xffffffff
void ksym_collect_stats(unsigned long hbp_hit_addr)
{
struct hlist_node *node;
struct trace_ksym *entry;
rcu_read_lock();
hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
if ((entry->ksym_addr == hbp_hit_addr) &&
(entry->counter <= MAX_UL_INT)) {
entry->counter++;
break;
}
}
rcu_read_unlock();
}
#endif /* CONFIG_PROFILE_KSYM_TRACER */
void ksym_hbp_handler(struct perf_event *hbp, void *data)
{
struct ring_buffer_event *event;
struct ksym_trace_entry *entry;
struct pt_regs *regs = data;
struct ring_buffer *buffer;
int pc;
if (!ksym_tracing_enabled)
return;
buffer = ksym_trace_array->buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
sizeof(*entry), 0, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->ip = instruction_pointer(regs);
entry->type = hw_breakpoint_type(hbp);
entry->addr = hw_breakpoint_addr(hbp);
strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
#ifdef CONFIG_PROFILE_KSYM_TRACER
ksym_collect_stats(hw_breakpoint_addr(hbp));
#endif /* CONFIG_PROFILE_KSYM_TRACER */
trace_buffer_unlock_commit(buffer, event, 0, pc);
}
/* Valid access types are represented as
*
* rw- : Set Read/Write Access Breakpoint
* -w- : Set Write Access Breakpoint
* --- : Clear Breakpoints
* --x : Set Execution Break points (Not available yet)
*
*/
static int ksym_trace_get_access_type(char *str)
{
int access = 0;
if (str[0] == 'r')
access |= HW_BREAKPOINT_R;
if (str[1] == 'w')
access |= HW_BREAKPOINT_W;
if (str[2] == 'x')
access |= HW_BREAKPOINT_X;
switch (access) {
case HW_BREAKPOINT_R:
case HW_BREAKPOINT_W:
case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
return access;
default:
return -EINVAL;
}
}
/*
* There can be several possible malformed requests and we attempt to capture
* all of them. We enumerate some of the rules
* 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
* i.e. multiple ':' symbols disallowed. Possible uses are of the form
* <module>:<ksym_name>:<op>.
* 2. No delimiter symbol ':' in the input string
* 3. Spurious operator symbols or symbols not in their respective positions
* 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
* 5. Kernel symbol not a part of /proc/kallsyms
* 6. Duplicate requests
*/
static int parse_ksym_trace_str(char *input_string, char **ksymname,
unsigned long *addr)
{
int ret;
*ksymname = strsep(&input_string, ":");
*addr = kallsyms_lookup_name(*ksymname);
/* Check for malformed request: (2), (1) and (5) */
if ((!input_string) ||
(strlen(input_string) != KSYM_TRACER_OP_LEN) ||
(*addr == 0))
return -EINVAL;;
ret = ksym_trace_get_access_type(input_string);
return ret;
}
int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
{
struct trace_ksym *entry;
int ret = -ENOMEM;
if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
" new requests for tracing can be accepted now.\n",
KSYM_TRACER_MAX);
return -ENOSPC;
}
entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
if (!entry)
return -ENOMEM;
entry->type = op;
entry->ksym_addr = addr;
entry->len = HW_BREAKPOINT_LEN_4;
ret = -EAGAIN;
entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr,
entry->len, entry->type,
ksym_hbp_handler, true);
if (IS_ERR(entry->ksym_hbp)) {
entry->ksym_hbp = NULL;
ret = PTR_ERR(entry->ksym_hbp);
}
if (!entry->ksym_hbp) {
printk(KERN_INFO "ksym_tracer request failed. Try again"
" later!!\n");
goto err;
}
hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
ksym_filter_entry_count++;
return 0;
err:
kfree(entry);
return ret;
}
static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct trace_ksym *entry;
struct hlist_node *node;
struct trace_seq *s;
ssize_t cnt = 0;
int ret;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
mutex_lock(&ksym_tracer_mutex);
hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr);
if (entry->type == HW_BREAKPOINT_R)
ret = trace_seq_puts(s, "r--\n");
else if (entry->type == HW_BREAKPOINT_W)
ret = trace_seq_puts(s, "-w-\n");
else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
ret = trace_seq_puts(s, "rw-\n");
WARN_ON_ONCE(!ret);
}
cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
mutex_unlock(&ksym_tracer_mutex);
kfree(s);
return cnt;
}
static void __ksym_trace_reset(void)
{
struct trace_ksym *entry;
struct hlist_node *node, *node1;
mutex_lock(&ksym_tracer_mutex);
hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
ksym_hlist) {
unregister_wide_hw_breakpoint(entry->ksym_hbp);
ksym_filter_entry_count--;
hlist_del_rcu(&(entry->ksym_hlist));
synchronize_rcu();
kfree(entry);
}
mutex_unlock(&ksym_tracer_mutex);
}
static ssize_t ksym_trace_filter_write(struct file *file,
const char __user *buffer,
size_t count, loff_t *ppos)
{
struct trace_ksym *entry;
struct hlist_node *node;
char *input_string, *ksymname = NULL;
unsigned long ksym_addr = 0;
int ret, op, changed = 0;
input_string = kzalloc(count + 1, GFP_KERNEL);
if (!input_string)
return -ENOMEM;
if (copy_from_user(input_string, buffer, count)) {
kfree(input_string);
return -EFAULT;
}
input_string[count] = '\0';
strstrip(input_string);
/*
* Clear all breakpoints if:
* 1: echo > ksym_trace_filter
* 2: echo 0 > ksym_trace_filter
* 3: echo "*:---" > ksym_trace_filter
*/
if (!input_string[0] || !strcmp(input_string, "0") ||
!strcmp(input_string, "*:---")) {
__ksym_trace_reset();
kfree(input_string);
return count;
}
ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
if (ret < 0) {
kfree(input_string);
return ret;
}
mutex_lock(&ksym_tracer_mutex);
ret = -EINVAL;
hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
if (entry->ksym_addr == ksym_addr) {
/* Check for malformed request: (6) */
if (entry->type != op)
changed = 1;
else
goto out;
break;
}
}
if (changed) {
unregister_wide_hw_breakpoint(entry->ksym_hbp);
entry->type = op;
if (op > 0) {
entry->ksym_hbp =
register_wide_hw_breakpoint(entry->ksym_addr,
entry->len, entry->type,
ksym_hbp_handler, true);
if (IS_ERR(entry->ksym_hbp))
entry->ksym_hbp = NULL;
if (!entry->ksym_hbp)
goto out;
}
ksym_filter_entry_count--;
hlist_del_rcu(&(entry->ksym_hlist));
synchronize_rcu();
kfree(entry);
ret = 0;
goto out;
} else {
/* Check for malformed request: (4) */
if (op == 0)
goto out;
ret = process_new_ksym_entry(ksymname, op, ksym_addr);
}
out:
mutex_unlock(&ksym_tracer_mutex);
kfree(input_string);
if (!ret)
ret = count;
return ret;
}
static const struct file_operations ksym_tracing_fops = {
.open = tracing_open_generic,
.read = ksym_trace_filter_read,
.write = ksym_trace_filter_write,
};
static void ksym_trace_reset(struct trace_array *tr)
{
ksym_tracing_enabled = 0;
__ksym_trace_reset();
}
static int ksym_trace_init(struct trace_array *tr)
{
int cpu, ret = 0;
for_each_online_cpu(cpu)
tracing_reset(tr, cpu);
ksym_tracing_enabled = 1;
ksym_trace_array = tr;
return ret;
}
static void ksym_trace_print_header(struct seq_file *m)
{
seq_puts(m,
"# TASK-PID CPU# Symbol "
"Type Function\n");
seq_puts(m,
"# | | | "
" | |\n");
}
static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
{
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
struct ksym_trace_entry *field;
char str[KSYM_SYMBOL_LEN];
int ret;
if (entry->type != TRACE_KSYM)
return TRACE_TYPE_UNHANDLED;
trace_assign_type(field, entry);
ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
entry->pid, iter->cpu, (char *)field->addr);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
switch (field->type) {
case HW_BREAKPOINT_R:
ret = trace_seq_printf(s, " R ");
break;
case HW_BREAKPOINT_W:
ret = trace_seq_printf(s, " W ");
break;
case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
ret = trace_seq_printf(s, " RW ");
break;
default:
return TRACE_TYPE_PARTIAL_LINE;
}
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
sprint_symbol(str, field->ip);
ret = trace_seq_printf(s, "%s\n", str);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
struct tracer ksym_tracer __read_mostly =
{
.name = "ksym_tracer",
.init = ksym_trace_init,
.reset = ksym_trace_reset,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_ksym,
#endif
.print_header = ksym_trace_print_header,
.print_line = ksym_trace_output
};
__init static int init_ksym_trace(void)
{
struct dentry *d_tracer;
struct dentry *entry;
d_tracer = tracing_init_dentry();
ksym_filter_entry_count = 0;
entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
NULL, &ksym_tracing_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'ksym_trace_filter' file\n");
return register_tracer(&ksym_tracer);
}
device_initcall(init_ksym_trace);
#ifdef CONFIG_PROFILE_KSYM_TRACER
static int ksym_tracer_stat_headers(struct seq_file *m)
{
seq_puts(m, " Access Type ");
seq_puts(m, " Symbol Counter\n");
seq_puts(m, " ----------- ");
seq_puts(m, " ------ -------\n");
return 0;
}
static int ksym_tracer_stat_show(struct seq_file *m, void *v)
{
struct hlist_node *stat = v;
struct trace_ksym *entry;
int access_type = 0;
char fn_name[KSYM_NAME_LEN];
entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
access_type = entry->type;
switch (access_type) {
case HW_BREAKPOINT_R:
seq_puts(m, " R ");
break;
case HW_BREAKPOINT_W:
seq_puts(m, " W ");
break;
case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
seq_puts(m, " RW ");
break;
default:
seq_puts(m, " NA ");
}
if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
seq_printf(m, " %-36s", fn_name);
else
seq_printf(m, " %-36s", "<NA>");
seq_printf(m, " %15lu\n", entry->counter);
return 0;
}
static void *ksym_tracer_stat_start(struct tracer_stat *trace)
{
return ksym_filter_head.first;
}
static void *
ksym_tracer_stat_next(void *v, int idx)
{
struct hlist_node *stat = v;
return stat->next;
}
static struct tracer_stat ksym_tracer_stats = {
.name = "ksym_tracer",
.stat_start = ksym_tracer_stat_start,
.stat_next = ksym_tracer_stat_next,
.stat_headers = ksym_tracer_stat_headers,
.stat_show = ksym_tracer_stat_show
};
__init static int ksym_tracer_stat_init(void)
{
int ret;
ret = register_stat_tracer(&ksym_tracer_stats);
if (ret) {
printk(KERN_WARNING "Warning: could not register "
"ksym tracer stats\n");
return 1;
}
return 0;
}
fs_initcall(ksym_tracer_stat_init);
#endif /* CONFIG_PROFILE_KSYM_TRACER */
...@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) ...@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_GRAPH_ENT: case TRACE_GRAPH_ENT:
case TRACE_GRAPH_RET: case TRACE_GRAPH_RET:
case TRACE_HW_BRANCHES: case TRACE_HW_BRANCHES:
case TRACE_KSYM:
return 1; return 1;
} }
return 0; return 0;
...@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace, ...@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
return ret; return ret;
} }
#endif /* CONFIG_HW_BRANCH_TRACER */ #endif /* CONFIG_HW_BRANCH_TRACER */
#ifdef CONFIG_KSYM_TRACER
static int ksym_selftest_dummy;
int
trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
{
unsigned long count;
int ret;
/* start the tracing */
ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
}
ksym_selftest_dummy = 0;
/* Register the read-write tracing request */
ret = process_new_ksym_entry("ksym_selftest_dummy",
HW_BREAKPOINT_R | HW_BREAKPOINT_W,
(unsigned long)(&ksym_selftest_dummy));
if (ret < 0) {
printk(KERN_CONT "ksym_trace read-write startup test failed\n");
goto ret_path;
}
/* Perform a read and a write operation over the dummy variable to
* trigger the tracer
*/
if (ksym_selftest_dummy == 0)
ksym_selftest_dummy++;
/* stop the tracing. */
tracing_stop();
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
trace->reset(tr);
tracing_start();
/* read & write operations - one each is performed on the dummy variable
* triggering two entries in the trace buffer
*/
if (!ret && count != 2) {
printk(KERN_CONT "Ksym tracer startup test failed");
ret = -1;
}
ret_path:
return ret;
}
#endif /* CONFIG_KSYM_TRACER */
...@@ -40,5 +40,11 @@ config SAMPLE_KRETPROBES ...@@ -40,5 +40,11 @@ config SAMPLE_KRETPROBES
default m default m
depends on SAMPLE_KPROBES && KRETPROBES depends on SAMPLE_KPROBES && KRETPROBES
config SAMPLE_HW_BREAKPOINT
tristate "Build kernel hardware breakpoint examples -- loadable module only"
depends on HAVE_HW_BREAKPOINT && m
help
This builds kernel hardware breakpoint example modules.
endif # SAMPLES endif # SAMPLES
# Makefile for Linux samples code # Makefile for Linux samples code
obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ \
hw_breakpoint/
obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
/*
* data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* usage: insmod data_breakpoint.ko ksym=<ksym_name>
*
* This file is a kernel module that places a breakpoint over ksym_name kernel
* variable using Hardware Breakpoint register. The corresponding handler which
* prints a backtrace is invoked everytime a write operation is performed on
* that variable.
*
* Copyright (C) IBM Corporation, 2009
*/
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
#include <linux/init.h> /* Needed for the macros */
#include <linux/kallsyms.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
struct perf_event **sample_hbp;
static char ksym_name[KSYM_NAME_LEN] = "pid_max";
module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
" write operations on the kernel symbol");
static void sample_hbp_handler(struct perf_event *temp, void *data)
{
printk(KERN_INFO "%s value is changed\n", ksym_name);
dump_stack();
printk(KERN_INFO "Dump stack from sample_hbp_handler\n");
}
static int __init hw_break_module_init(void)
{
int ret;
unsigned long addr;
addr = kallsyms_lookup_name(ksym_name);
sample_hbp = register_wide_hw_breakpoint(addr, HW_BREAKPOINT_LEN_4,
HW_BREAKPOINT_W | HW_BREAKPOINT_R,
sample_hbp_handler, true);
if (IS_ERR(sample_hbp)) {
ret = PTR_ERR(sample_hbp);
goto fail;
} else if (!sample_hbp) {
ret = -EINVAL;
goto fail;
}
printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name);
return 0;
fail:
printk(KERN_INFO "Breakpoint registration failed\n");
return ret;
}
static void __exit hw_break_module_exit(void)
{
unregister_wide_hw_breakpoint(sample_hbp);
printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
}
module_init(hw_break_module_init);
module_exit(hw_break_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("K.Prasad");
MODULE_DESCRIPTION("ksym breakpoint");
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment