Commit 69766c40 authored by Ingo Molnar's avatar Ingo Molnar

Merge tag 'perf-urgent-for-mingo-20160809' of...

Merge tag 'perf-urgent-for-mingo-20160809' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/urgent

Pull perf/urgent fixes from Arnaldo Carvalho de Melo:

User visible fixes:

- Fix the lookup for a kernel module in 'perf probe', fixing for instance, the
  erroneous return of "[raid10]" when looking for "[raid1]"  (Konstantin Khlebnikov)

- Disable counters in a group before reading them in 'perf stat', to avoid skew (Mark Rutland)

- Fix adding probes to function aliases in systems using kaslr (Masami Hiramatsu)

- Trip libtraceevent trace_seq buffers, removing unnecessary memory usage that could
  bring a system using tracepoint events with 'perf top' to a crawl, as the trace_seq
  buffers start at a whooping 4 KB, which is very rarely used in perf's usecases,
  so realloc it to the really used space as a last measure after using libtraceevent
  functions to format the fields of tracepoint events (Arnaldo Carvalho de Melo)

- Fix 'perf probe' location when using DWARF on ppc64le (Ravi Bangoria)

- Allow specifying signedness casts to a 'perf probe' variable, to shorten
  the number of steps to see signed values that otherwise would always appear
  as hex values (Naohiro Aota)

Documentation fixes:

- Add 'bpf-output' field to 'perf script' usage message (Brendan Gregg)

Infrastructure fixes:

- Sync kernel header files: cpufeatures.h, {disabled,required}-features.h,
  bpf.h and vmx.h, so that we get a clean build, without warnings about files
  being different from the kernel counterparts.

  A verification of the need or desirability of changes in tools/ based on what
  was done in the kernel changesets was made and documented in the respective
  file sync changesets (Arnaldo Carvalho de Melo)
Signed-off-by: default avatarArnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parents 81abf252 99e608b5
......@@ -225,7 +225,6 @@
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
......@@ -301,10 +300,6 @@
#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
#define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */
#define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */
#ifdef CONFIG_X86_32
/*
* 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
......@@ -312,5 +307,7 @@
*/
#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
#endif
#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#endif /* _ASM_X86_CPUFEATURES_H */
......@@ -56,5 +56,7 @@
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
#define DISABLED_MASK17 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
......@@ -99,5 +99,7 @@
#define REQUIRED_MASK14 0
#define REQUIRED_MASK15 0
#define REQUIRED_MASK16 0
#define REQUIRED_MASK17 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
......@@ -78,7 +78,6 @@
#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
#define EXIT_REASON_PCOMMIT 65
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
......@@ -127,8 +126,7 @@
{ EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }, \
{ EXIT_REASON_PCOMMIT, "PCOMMIT" }
{ EXIT_REASON_XRSTORS, "XRSTORS" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
......
......@@ -84,6 +84,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_PERCPU_HASH,
BPF_MAP_TYPE_PERCPU_ARRAY,
BPF_MAP_TYPE_STACK_TRACE,
BPF_MAP_TYPE_CGROUP_ARRAY,
};
enum bpf_prog_type {
......@@ -93,6 +94,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SCHED_CLS,
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
};
#define BPF_PSEUDO_MAP_FD 1
......@@ -313,6 +315,66 @@ enum bpf_func_id {
*/
BPF_FUNC_skb_get_tunnel_opt,
BPF_FUNC_skb_set_tunnel_opt,
/**
* bpf_skb_change_proto(skb, proto, flags)
* Change protocol of the skb. Currently supported is
* v4 -> v6, v6 -> v4 transitions. The helper will also
* resize the skb. eBPF program is expected to fill the
* new headers via skb_store_bytes and lX_csum_replace.
* @skb: pointer to skb
* @proto: new skb->protocol type
* @flags: reserved
* Return: 0 on success or negative error
*/
BPF_FUNC_skb_change_proto,
/**
* bpf_skb_change_type(skb, type)
* Change packet type of skb.
* @skb: pointer to skb
* @type: new skb->pkt_type type
* Return: 0 on success or negative error
*/
BPF_FUNC_skb_change_type,
/**
* bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb
* @skb: pointer to skb
* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
* @index: index of the cgroup in the bpf_map
* Return:
* == 0 skb failed the cgroup2 descendant test
* == 1 skb succeeded the cgroup2 descendant test
* < 0 error
*/
BPF_FUNC_skb_in_cgroup,
/**
* bpf_get_hash_recalc(skb)
* Retrieve and possibly recalculate skb->hash.
* @skb: pointer to skb
* Return: hash
*/
BPF_FUNC_get_hash_recalc,
/**
* u64 bpf_get_current_task(void)
* Returns current task_struct
* Return: current
*/
BPF_FUNC_get_current_task,
/**
* bpf_probe_write_user(void *dst, void *src, int len)
* safely attempt to write to a location
* @dst: destination address in userspace
* @src: source address on stack
* @len: number of bytes to copy
* Return: 0 on success or negative error
*/
BPF_FUNC_probe_write_user,
__BPF_FUNC_MAX_ID,
};
......@@ -347,9 +409,11 @@ enum bpf_func_id {
#define BPF_F_ZERO_CSUM_TX (1ULL << 1)
#define BPF_F_DONT_FRAGMENT (1ULL << 2)
/* BPF_FUNC_perf_event_output flags. */
/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
#define BPF_F_INDEX_MASK 0xffffffffULL
#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK
/* BPF_FUNC_perf_event_output for sk_buff input context. */
#define BPF_F_CTXLEN_MASK (0xfffffULL << 32)
/* user accessible mirror of in-kernel sk_buff.
* new fields can only be added to the end of this structure
......@@ -386,4 +450,24 @@ struct bpf_tunnel_key {
__u32 tunnel_label;
};
/* User return codes for XDP prog type.
* A valid XDP program must return one of these defined values. All other
* return codes are reserved for future use. Unknown return codes will result
* in packet drop.
*/
enum xdp_action {
XDP_ABORTED = 0,
XDP_DROP,
XDP_PASS,
XDP_TX,
};
/* user accessible metadata for XDP packet hook
* new fields must be added to the end of this structure
*/
struct xdp_md {
__u32 data;
__u32 data_end;
};
#endif /* _UAPI__LINUX_BPF_H__ */
......@@ -176,10 +176,18 @@ Each probe argument follows below syntax.
'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
'$vars' and '$params' special arguments are also available for NAME, '$vars' is expanded to the local variables (including function parameters) which can access at given probe point. '$params' is expanded to only the function parameters.
'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type.
'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), signedness casting (u/s), "string" and bitfield are supported. (see TYPES for detail)
On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid.
TYPES
-----
Basic types (u8/u16/u32/u64/s8/s16/s32/s64) are integer types. Prefix 's' and 'u' means those types are signed and unsigned respectively. Traced arguments are shown in decimal (signed) or hex (unsigned). You can also use 's' or 'u' to specify only signedness and leave its size auto-detected by perf probe.
String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container has been paged out. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type.
Bitfield is another special type, which takes 3 parameters, bit-width, bit-offset, and container-size (usually 32). The syntax is;
b<bit-width>@<bit-offset>/<container-size>
LINE SYNTAX
-----------
Line range is described by following syntax.
......
......@@ -116,8 +116,8 @@ OPTIONS
--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
srcline, period, iregs, brstack, brstacksym, flags.
Field list can be prepended with the type, trace, sw or hw,
srcline, period, iregs, brstack, brstacksym, flags, bpf-output,
callindent. Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace
......
......@@ -54,10 +54,6 @@ int arch__compare_symbol_names(const char *namea, const char *nameb)
#endif
#if defined(_CALL_ELF) && _CALL_ELF == 2
bool arch__prefers_symtab(void)
{
return true;
}
#ifdef HAVE_LIBELF_SUPPORT
void arch__sym_update(struct symbol *s, GElf_Sym *sym)
......@@ -100,4 +96,27 @@ void arch__fix_tev_from_maps(struct perf_probe_event *pev,
tev->point.offset += lep_offset;
}
}
void arch__post_process_probe_trace_events(struct perf_probe_event *pev,
int ntevs)
{
struct probe_trace_event *tev;
struct map *map;
struct symbol *sym = NULL;
struct rb_node *tmp;
int i = 0;
map = get_target_map(pev->target, pev->uprobes);
if (!map || map__load(map, NULL) < 0)
return;
for (i = 0; i < ntevs; i++) {
tev = &pev->tevs[i];
map__for_each_symbol(map, sym, tmp) {
if (map->unmap_ip(map, sym->start) == tev->point.address)
arch__fix_tev_from_maps(pev, tev, map, sym);
}
}
}
#endif
......@@ -2116,7 +2116,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
"Valid types: hw,sw,trace,raw. "
"Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
"addr,symoff,period,iregs,brstack,brstacksym,flags,"
"callindent", parse_output_fields),
"bpf-output,callindent", parse_output_fields),
OPT_BOOLEAN('a', "all-cpus", &system_wide,
"system-wide collection from all CPUs"),
OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
......
......@@ -331,7 +331,7 @@ static int read_counter(struct perf_evsel *counter)
return 0;
}
static void read_counters(bool close_counters)
static void read_counters(void)
{
struct perf_evsel *counter;
......@@ -341,11 +341,6 @@ static void read_counters(bool close_counters)
if (perf_stat_process_counter(&stat_config, counter))
pr_warning("failed to process counter %s\n", counter->name);
if (close_counters) {
perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter),
thread_map__nr(evsel_list->threads));
}
}
}
......@@ -353,7 +348,7 @@ static void process_interval(void)
{
struct timespec ts, rs;
read_counters(false);
read_counters();
clock_gettime(CLOCK_MONOTONIC, &ts);
diff_timespec(&rs, &ts, &ref_time);
......@@ -380,6 +375,17 @@ static void enable_counters(void)
perf_evlist__enable(evsel_list);
}
static void disable_counters(void)
{
/*
* If we don't have tracee (attaching to task or cpu), counters may
* still be running. To get accurate group ratios, we must stop groups
* from counting before reading their constituent counters.
*/
if (!target__none(&target))
perf_evlist__disable(evsel_list);
}
static volatile int workload_exec_errno;
/*
......@@ -657,11 +663,20 @@ static int __run_perf_stat(int argc, const char **argv)
}
}
disable_counters();
t1 = rdclock();
update_stats(&walltime_nsecs_stats, t1 - t0);
read_counters(true);
/*
* Closing a group leader splits the group, and as we only disable
* group leaders, results in remaining events becoming enabled. To
* avoid arbitrary skew, we must read all counters before closing any
* group leaders.
*/
read_counters();
perf_evlist__close(evsel_list);
return WEXITSTATUS(status);
}
......
......@@ -170,15 +170,17 @@ static struct map *kernel_get_module_map(const char *module)
module = "kernel";
for (pos = maps__first(maps); pos; pos = map__next(pos)) {
/* short_name is "[module]" */
if (strncmp(pos->dso->short_name + 1, module,
pos->dso->short_name_len - 2) == 0) {
pos->dso->short_name_len - 2) == 0 &&
module[pos->dso->short_name_len - 2] == '\0') {
return pos;
}
}
return NULL;
}
static struct map *get_target_map(const char *target, bool user)
struct map *get_target_map(const char *target, bool user)
{
/* Init maps of given executable or kernel */
if (user)
......@@ -385,7 +387,7 @@ static int find_alternative_probe_point(struct debuginfo *dinfo,
if (uprobes)
address = sym->start;
else
address = map->unmap_ip(map, sym->start);
address = map->unmap_ip(map, sym->start) - map->reloc;
break;
}
if (!address) {
......@@ -664,22 +666,14 @@ static int add_module_to_probe_trace_events(struct probe_trace_event *tevs,
return ret;
}
/* Post processing the probe events */
static int post_process_probe_trace_events(struct probe_trace_event *tevs,
int ntevs, const char *module,
bool uprobe)
static int
post_process_kernel_probe_trace_events(struct probe_trace_event *tevs,
int ntevs)
{
struct ref_reloc_sym *reloc_sym;
char *tmp;
int i, skipped = 0;
if (uprobe)
return add_exec_to_probe_trace_events(tevs, ntevs, module);
/* Note that currently ref_reloc_sym based probe is not for drivers */
if (module)
return add_module_to_probe_trace_events(tevs, ntevs, module);
reloc_sym = kernel_get_ref_reloc_sym();
if (!reloc_sym) {
pr_warning("Relocated base symbol is not found!\n");
......@@ -711,6 +705,34 @@ static int post_process_probe_trace_events(struct probe_trace_event *tevs,
return skipped;
}
void __weak
arch__post_process_probe_trace_events(struct perf_probe_event *pev __maybe_unused,
int ntevs __maybe_unused)
{
}
/* Post processing the probe events */
static int post_process_probe_trace_events(struct perf_probe_event *pev,
struct probe_trace_event *tevs,
int ntevs, const char *module,
bool uprobe)
{
int ret;
if (uprobe)
ret = add_exec_to_probe_trace_events(tevs, ntevs, module);
else if (module)
/* Currently ref_reloc_sym based probe is not for drivers */
ret = add_module_to_probe_trace_events(tevs, ntevs, module);
else
ret = post_process_kernel_probe_trace_events(tevs, ntevs);
if (ret >= 0)
arch__post_process_probe_trace_events(pev, ntevs);
return ret;
}
/* Try to find perf_probe_event with debuginfo */
static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
struct probe_trace_event **tevs)
......@@ -749,7 +771,7 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
if (ntevs > 0) { /* Succeeded to find trace events */
pr_debug("Found %d probe_trace_events.\n", ntevs);
ret = post_process_probe_trace_events(*tevs, ntevs,
ret = post_process_probe_trace_events(pev, *tevs, ntevs,
pev->target, pev->uprobes);
if (ret < 0 || ret == ntevs) {
clear_probe_trace_events(*tevs, ntevs);
......@@ -2936,8 +2958,6 @@ static int try_to_find_absolute_address(struct perf_probe_event *pev,
return err;
}
bool __weak arch__prefers_symtab(void) { return false; }
/* Concatinate two arrays */
static void *memcat(void *a, size_t sz_a, void *b, size_t sz_b)
{
......@@ -3158,12 +3178,6 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev,
if (ret > 0 || pev->sdt) /* SDT can be found only in the cache */
return ret == 0 ? -ENOENT : ret; /* Found in probe cache */
if (arch__prefers_symtab() && !perf_probe_event_need_dwarf(pev)) {
ret = find_probe_trace_events_from_map(pev, tevs);
if (ret > 0)
return ret; /* Found in symbol table */
}
/* Convert perf_probe_event with debuginfo */
ret = try_to_find_probe_trace_events(pev, tevs);
if (ret != 0)
......
......@@ -158,7 +158,6 @@ int show_line_range(struct line_range *lr, const char *module, bool user);
int show_available_vars(struct perf_probe_event *pevs, int npevs,
struct strfilter *filter);
int show_available_funcs(const char *module, struct strfilter *filter, bool user);
bool arch__prefers_symtab(void);
void arch__fix_tev_from_maps(struct perf_probe_event *pev,
struct probe_trace_event *tev, struct map *map,
struct symbol *sym);
......@@ -173,4 +172,9 @@ int e_snprintf(char *str, size_t size, const char *format, ...)
int copy_to_probe_trace_arg(struct probe_trace_arg *tvar,
struct perf_probe_arg *pvar);
struct map *get_target_map(const char *target, bool user);
void arch__post_process_probe_trace_events(struct perf_probe_event *pev,
int ntevs);
#endif /*_PROBE_EVENT_H */
......@@ -297,10 +297,13 @@ static int convert_variable_type(Dwarf_Die *vr_die,
char sbuf[STRERR_BUFSIZE];
int bsize, boffs, total;
int ret;
char sign;
/* TODO: check all types */
if (cast && strcmp(cast, "string") != 0) {
if (cast && strcmp(cast, "string") != 0 &&
strcmp(cast, "s") != 0 && strcmp(cast, "u") != 0) {
/* Non string type is OK */
/* and respect signedness cast */
tvar->type = strdup(cast);
return (tvar->type == NULL) ? -ENOMEM : 0;
}
......@@ -361,6 +364,13 @@ static int convert_variable_type(Dwarf_Die *vr_die,
return (tvar->type == NULL) ? -ENOMEM : 0;
}
if (cast && (strcmp(cast, "u") == 0))
sign = 'u';
else if (cast && (strcmp(cast, "s") == 0))
sign = 's';
else
sign = die_is_signed_type(&type) ? 's' : 'u';
ret = dwarf_bytesize(&type);
if (ret <= 0)
/* No size ... try to use default type */
......@@ -373,8 +383,7 @@ static int convert_variable_type(Dwarf_Die *vr_die,
dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
ret = MAX_BASIC_TYPE_BITS;
}
ret = snprintf(buf, 16, "%c%d",
die_is_signed_type(&type) ? 's' : 'u', ret);
ret = snprintf(buf, 16, "%c%d", sign, ret);
formatted:
if (ret < 0 || ret >= 16) {
......
......@@ -588,7 +588,11 @@ static char *get_trace_output(struct hist_entry *he)
} else {
pevent_event_info(&seq, evsel->tp_format, &rec);
}
return seq.buffer;
/*
* Trim the buffer, it starts at 4KB and we're not going to
* add anything more to this buffer.
*/
return realloc(seq.buffer, seq.len + 1);
}
static int64_t
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment