Commit 590db42d authored by Kan Liang, committed by Arnaldo Carvalho de Melo

perf report: Support instruction latency

The instruction latency information can be recorded on some platforms,
e.g., the Intel Sapphire Rapids server. With both memory latency
(weight) and the new instruction latency information, users can easily
locate the expensive load instructions, and also understand the time
spent in different stages. The users can optimize their applications in
different pipeline stages.

The 'weight' field is shared among different architectures. Reusing the
'weight' field may impact other architectures. Add a new field to store
the instruction latency.

Like the 'weight' support, introduce a 'ins_lat' for the global
instruction latency, and a 'local_ins_lat' for the local instruction
latency version.

Add new sort functions, INSTR Latency and Local INSTR Latency,
accordingly.

Add local_ins_lat to the default_mem_sort_order[].
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/1612296553-21962-7-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent ea8d0ed6
...@@ -109,6 +109,9 @@ OPTIONS ...@@ -109,6 +109,9 @@ OPTIONS
- time: Separate the samples by time stamp with the resolution specified by - time: Separate the samples by time stamp with the resolution specified by
--time-quantum (default 100ms). Specify with overhead and before it. --time-quantum (default 100ms). Specify with overhead and before it.
- code_page_size: the code page size of sampled code address (ip) - code_page_size: the code page size of sampled code address (ip)
- ins_lat: Instruction latency in core cycles. This is the global instruction
latency
- local_ins_lat: Local instruction latency version
By default, comm, dso and symbol keys are used. By default, comm, dso and symbol keys are used.
(i.e. --sort comm,dso,symbol) (i.e. --sort comm,dso,symbol)
...@@ -155,7 +158,8 @@ OPTIONS ...@@ -155,7 +158,8 @@ OPTIONS
- blocked: reason of blocked load access for the data at the time of the sample - blocked: reason of blocked load access for the data at the time of the sample
And the default sort keys are changed to local_weight, mem, sym, dso, And the default sort keys are changed to local_weight, mem, sym, dso,
symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, see '--mem-mode'. symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat,
see '--mem-mode'.
If the data file has tracepoint event(s), following (dynamic) sort keys If the data file has tracepoint event(s), following (dynamic) sort keys
are also available: are also available:
......
...@@ -142,6 +142,7 @@ struct perf_sample { ...@@ -142,6 +142,7 @@ struct perf_sample {
u16 insn_len; u16 insn_len;
u8 cpumode; u8 cpumode;
u16 misc; u16 misc;
u16 ins_lat;
bool no_hw_idx; /* No hw_idx collected in branch_stack */ bool no_hw_idx; /* No hw_idx collected in branch_stack */
char insn[MAX_INSN]; char insn[MAX_INSN];
void *raw_data; void *raw_data;
......
...@@ -2352,8 +2352,10 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, ...@@ -2352,8 +2352,10 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
weight.full = *array; weight.full = *array;
if (type & PERF_SAMPLE_WEIGHT) if (type & PERF_SAMPLE_WEIGHT)
data->weight = weight.full; data->weight = weight.full;
else else {
data->weight = weight.var1_dw; data->weight = weight.var1_dw;
data->ins_lat = weight.var2_w;
}
array++; array++;
} }
......
...@@ -209,6 +209,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) ...@@ -209,6 +209,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
hists__new_col_len(hists, HISTC_LOCAL_WEIGHT, 12); hists__new_col_len(hists, HISTC_LOCAL_WEIGHT, 12);
hists__new_col_len(hists, HISTC_GLOBAL_WEIGHT, 12); hists__new_col_len(hists, HISTC_GLOBAL_WEIGHT, 12);
hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10); hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10);
hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13);
hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13);
if (symbol_conf.nanosecs) if (symbol_conf.nanosecs)
hists__new_col_len(hists, HISTC_TIME, 16); hists__new_col_len(hists, HISTC_TIME, 16);
else else
...@@ -287,12 +289,13 @@ static long hist_time(unsigned long htime) ...@@ -287,12 +289,13 @@ static long hist_time(unsigned long htime)
} }
static void he_stat__add_period(struct he_stat *he_stat, u64 period, static void he_stat__add_period(struct he_stat *he_stat, u64 period,
u64 weight) u64 weight, u64 ins_lat)
{ {
he_stat->period += period; he_stat->period += period;
he_stat->weight += weight; he_stat->weight += weight;
he_stat->nr_events += 1; he_stat->nr_events += 1;
he_stat->ins_lat += ins_lat;
} }
static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
...@@ -304,6 +307,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) ...@@ -304,6 +307,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
dest->period_guest_us += src->period_guest_us; dest->period_guest_us += src->period_guest_us;
dest->nr_events += src->nr_events; dest->nr_events += src->nr_events;
dest->weight += src->weight; dest->weight += src->weight;
dest->ins_lat += src->ins_lat;
} }
static void he_stat__decay(struct he_stat *he_stat) static void he_stat__decay(struct he_stat *he_stat)
...@@ -592,6 +596,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, ...@@ -592,6 +596,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
int64_t cmp; int64_t cmp;
u64 period = entry->stat.period; u64 period = entry->stat.period;
u64 weight = entry->stat.weight; u64 weight = entry->stat.weight;
u64 ins_lat = entry->stat.ins_lat;
bool leftmost = true; bool leftmost = true;
p = &hists->entries_in->rb_root.rb_node; p = &hists->entries_in->rb_root.rb_node;
...@@ -610,11 +615,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, ...@@ -610,11 +615,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
if (!cmp) { if (!cmp) {
if (sample_self) { if (sample_self) {
he_stat__add_period(&he->stat, period, weight); he_stat__add_period(&he->stat, period, weight, ins_lat);
hist_entry__add_callchain_period(he, period); hist_entry__add_callchain_period(he, period);
} }
if (symbol_conf.cumulate_callchain) if (symbol_conf.cumulate_callchain)
he_stat__add_period(he->stat_acc, period, weight); he_stat__add_period(he->stat_acc, period, weight, ins_lat);
/* /*
* This mem info was allocated from sample__resolve_mem * This mem info was allocated from sample__resolve_mem
...@@ -725,6 +730,7 @@ __hists__add_entry(struct hists *hists, ...@@ -725,6 +730,7 @@ __hists__add_entry(struct hists *hists,
.nr_events = 1, .nr_events = 1,
.period = sample->period, .period = sample->period,
.weight = sample->weight, .weight = sample->weight,
.ins_lat = sample->ins_lat,
}, },
.parent = sym_parent, .parent = sym_parent,
.filtered = symbol__parent_filter(sym_parent) | al->filtered, .filtered = symbol__parent_filter(sym_parent) | al->filtered,
......
...@@ -73,6 +73,8 @@ enum hist_column { ...@@ -73,6 +73,8 @@ enum hist_column {
HISTC_DSO_SIZE, HISTC_DSO_SIZE,
HISTC_SYMBOL_IPC, HISTC_SYMBOL_IPC,
HISTC_MEM_BLOCKED, HISTC_MEM_BLOCKED,
HISTC_LOCAL_INS_LAT,
HISTC_GLOBAL_INS_LAT,
HISTC_NR_COLS, /* Last entry */ HISTC_NR_COLS, /* Last entry */
}; };
......
...@@ -1871,9 +1871,10 @@ static int intel_pt_synth_pebs_sample(struct intel_pt_queue *ptq) ...@@ -1871,9 +1871,10 @@ static int intel_pt_synth_pebs_sample(struct intel_pt_queue *ptq)
* cycles. Use latency >> 32 to distinguish the * cycles. Use latency >> 32 to distinguish the
* different format of the mem access latency field. * different format of the mem access latency field.
*/ */
if (weight > 0) if (weight > 0) {
sample.weight = weight & 0xffff; sample.weight = weight & 0xffff;
else sample.ins_lat = items->mem_access_latency & 0xffff;
} else
sample.weight = items->mem_access_latency; sample.weight = items->mem_access_latency;
} }
if (!sample.weight && items->has_tsx_aux_info) { if (!sample.weight && items->has_tsx_aux_info) {
......
...@@ -1300,8 +1300,12 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, ...@@ -1300,8 +1300,12 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
if (sample_type & PERF_SAMPLE_STACK_USER) if (sample_type & PERF_SAMPLE_STACK_USER)
stack_user__printf(&sample->user_stack); stack_user__printf(&sample->user_stack);
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
printf("... weight: %" PRIu64 "\n", sample->weight); printf("... weight: %" PRIu64 "", sample->weight);
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
printf(",0x%"PRIx16"", sample->ins_lat);
printf("\n");
}
if (sample_type & PERF_SAMPLE_DATA_SRC) if (sample_type & PERF_SAMPLE_DATA_SRC)
printf(" . data_src: 0x%"PRIx64"\n", sample->data_src); printf(" . data_src: 0x%"PRIx64"\n", sample->data_src);
......
...@@ -36,7 +36,7 @@ const char default_parent_pattern[] = "^sys_|^do_page_fault"; ...@@ -36,7 +36,7 @@ const char default_parent_pattern[] = "^sys_|^do_page_fault";
const char *parent_pattern = default_parent_pattern; const char *parent_pattern = default_parent_pattern;
const char *default_sort_order = "comm,dso,symbol"; const char *default_sort_order = "comm,dso,symbol";
const char default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles"; const char default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked"; const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat";
const char default_top_sort_order[] = "dso,symbol"; const char default_top_sort_order[] = "dso,symbol";
const char default_diff_sort_order[] = "dso,symbol"; const char default_diff_sort_order[] = "dso,symbol";
const char default_tracepoint_sort_order[] = "trace"; const char default_tracepoint_sort_order[] = "trace";
...@@ -1365,6 +1365,49 @@ struct sort_entry sort_global_weight = { ...@@ -1365,6 +1365,49 @@ struct sort_entry sort_global_weight = {
.se_width_idx = HISTC_GLOBAL_WEIGHT, .se_width_idx = HISTC_GLOBAL_WEIGHT,
}; };
/*
 * Average instruction latency of a hist entry: the accumulated ins_lat
 * divided by the number of events folded into the entry.  An entry with
 * no events yields 0 so the division is never performed with a zero
 * divisor.
 */
static u64 he_ins_lat(struct hist_entry *he)
{
	if (!he->stat.nr_events)
		return 0;

	return he->stat.ins_lat / he->stat.nr_events;
}
static int64_t
sort__local_ins_lat_cmp(struct hist_entry *left, struct hist_entry *right)
{
return he_ins_lat(left) - he_ins_lat(right);
}
/*
 * Format the per-event average instruction latency of @he into @bf
 * (at most @size bytes), left-justified in a column of @width
 * characters.  Returns the value produced by repsep_snprintf().
 */
static int hist_entry__local_ins_lat_snprintf(struct hist_entry *he, char *bf,
					      size_t size, unsigned int width)
{
	/*
	 * he_ins_lat() returns a u64; "%u" would consume it as an
	 * unsigned int, which is a format/argument mismatch.  Print it
	 * as unsigned long long with an explicit cast so the specifier
	 * matches regardless of how u64 is typedef'd.
	 */
	return repsep_snprintf(bf, size, "%-*llu", width,
			       (unsigned long long)he_ins_lat(he));
}
/*
 * Sort entry for the local instruction latency column: compares and
 * prints the per-event average via the local_ins_lat helpers and uses
 * the HISTC_LOCAL_INS_LAT column width slot.
 */
struct sort_entry sort_local_ins_lat = {
	.se_width_idx	= HISTC_LOCAL_INS_LAT,
	.se_header	= "Local INSTR Latency",
	.se_snprintf	= hist_entry__local_ins_lat_snprintf,
	.se_cmp		= sort__local_ins_lat_cmp,
};
static int64_t
sort__global_ins_lat_cmp(struct hist_entry *left, struct hist_entry *right)
{
return left->stat.ins_lat - right->stat.ins_lat;
}
/*
 * Format the accumulated (global) instruction latency of @he into @bf
 * (at most @size bytes), left-justified in a column of @width
 * characters.  Returns the value produced by repsep_snprintf().
 */
static int hist_entry__global_ins_lat_snprintf(struct hist_entry *he, char *bf,
					       size_t size, unsigned int width)
{
	/*
	 * stat.ins_lat is a u64 running sum; "%u" would consume it as an
	 * unsigned int, which is a format/argument mismatch and would
	 * drop the upper bits of large totals.  Print it as unsigned
	 * long long with an explicit cast so the specifier always
	 * matches the argument.
	 */
	return repsep_snprintf(bf, size, "%-*llu", width,
			       (unsigned long long)he->stat.ins_lat);
}
/*
 * Sort entry for the global instruction latency column: compares and
 * prints the accumulated latency via the global_ins_lat helpers and
 * uses the HISTC_GLOBAL_INS_LAT column width slot.
 */
struct sort_entry sort_global_ins_lat = {
	.se_width_idx	= HISTC_GLOBAL_INS_LAT,
	.se_header	= "INSTR Latency",
	.se_snprintf	= hist_entry__global_ins_lat_snprintf,
	.se_cmp		= sort__global_ins_lat_cmp,
};
struct sort_entry sort_mem_daddr_sym = { struct sort_entry sort_mem_daddr_sym = {
.se_header = "Data Symbol", .se_header = "Data Symbol",
.se_cmp = sort__daddr_cmp, .se_cmp = sort__daddr_cmp,
...@@ -1796,6 +1839,8 @@ static struct sort_dimension common_sort_dimensions[] = { ...@@ -1796,6 +1839,8 @@ static struct sort_dimension common_sort_dimensions[] = {
DIM(SORT_SYM_IPC_NULL, "ipc_null", sort_sym_ipc_null), DIM(SORT_SYM_IPC_NULL, "ipc_null", sort_sym_ipc_null),
DIM(SORT_TIME, "time", sort_time), DIM(SORT_TIME, "time", sort_time),
DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size), DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
}; };
#undef DIM #undef DIM
......
...@@ -50,6 +50,7 @@ struct he_stat { ...@@ -50,6 +50,7 @@ struct he_stat {
u64 period_guest_sys; u64 period_guest_sys;
u64 period_guest_us; u64 period_guest_us;
u64 weight; u64 weight;
u64 ins_lat;
u32 nr_events; u32 nr_events;
}; };
...@@ -231,6 +232,8 @@ enum sort_type { ...@@ -231,6 +232,8 @@ enum sort_type {
SORT_SYM_IPC_NULL, SORT_SYM_IPC_NULL,
SORT_TIME, SORT_TIME,
SORT_CODE_PAGE_SIZE, SORT_CODE_PAGE_SIZE,
SORT_LOCAL_INS_LAT,
SORT_GLOBAL_INS_LAT,
/* branch stack specific sort keys */ /* branch stack specific sort keys */
__SORT_BRANCH_STACK, __SORT_BRANCH_STACK,
......
...@@ -1644,8 +1644,10 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo ...@@ -1644,8 +1644,10 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo
if (type & PERF_SAMPLE_WEIGHT_TYPE) { if (type & PERF_SAMPLE_WEIGHT_TYPE) {
*array = sample->weight; *array = sample->weight;
if (type & PERF_SAMPLE_WEIGHT_STRUCT) if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
*array &= 0xffffffff; *array &= 0xffffffff;
*array |= ((u64)sample->ins_lat << 32);
}
array++; array++;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment