Commit 12c08a9f authored by Stephane Eranian's avatar Stephane Eranian Committed by Arnaldo Carvalho de Melo

perf stat: Add per-core aggregation

This patch adds the --per-core option to perf stat.

This option is used to aggregate system-wide counts
on a per physical core basis. On processors with
hyperthreading, this means counts of all HT threads
running on a physical core are aggregated.

This mode is useful to find imbalance between physical
cores running a uniform workload. Cores are identified
by socket: S0-C1 means physical core 1 on socket 0. Note
that cores are identified using their physical core id,
thus their numbering may not be contiguous.

Per core aggregation can be combined with interval printing:

 # perf stat -a --per-core -I 1000 -e cycles sleep 1000
 #           time core         cpus             counts events
      1.000090030 S0-C0           1          4,765,747 cycles
      1.000090030 S0-C1           1          5,580,647 cycles
      1.000090030 S0-C2           1            221,181 cycles
      1.000090030 S0-C3           1            266,092 cycles
Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1360846649-6411-4-git-send-email-eranian@google.com
[ committer note: Remove parts already applied on 86ee6e18 to keep bisectability ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent d4304958
...@@ -126,6 +126,12 @@ use --per-socket in addition to -a. (system-wide). The output includes the ...@@ -126,6 +126,12 @@ use --per-socket in addition to -a. (system-wide). The output includes the
socket number and the number of online processors on that socket. This is socket number and the number of online processors on that socket. This is
useful to gauge the amount of aggregation. useful to gauge the amount of aggregation.
--per-core::
Aggregate counts per physical processor for system-wide mode measurements. This
is a useful mode to detect imbalance between physical cores. To enable this mode,
use --per-core in addition to -a (system-wide). The output includes the
core number and the number of online logical processors on that physical processor.
EXAMPLES EXAMPLES
-------- --------
......
...@@ -80,6 +80,7 @@ enum aggr_mode { ...@@ -80,6 +80,7 @@ enum aggr_mode {
AGGR_NONE, AGGR_NONE,
AGGR_GLOBAL, AGGR_GLOBAL,
AGGR_SOCKET, AGGR_SOCKET,
AGGR_CORE,
}; };
static int run_count = 1; static int run_count = 1;
...@@ -384,6 +385,9 @@ static void print_interval(void) ...@@ -384,6 +385,9 @@ static void print_interval(void)
case AGGR_SOCKET: case AGGR_SOCKET:
fprintf(output, "# time socket cpus counts events\n"); fprintf(output, "# time socket cpus counts events\n");
break; break;
case AGGR_CORE:
fprintf(output, "# time core cpus counts events\n");
break;
case AGGR_NONE: case AGGR_NONE:
fprintf(output, "# time CPU counts events\n"); fprintf(output, "# time CPU counts events\n");
break; break;
...@@ -397,6 +401,7 @@ static void print_interval(void) ...@@ -397,6 +401,7 @@ static void print_interval(void)
num_print_interval = 0; num_print_interval = 0;
switch (aggr_mode) { switch (aggr_mode) {
case AGGR_CORE:
case AGGR_SOCKET: case AGGR_SOCKET:
print_aggr(prefix); print_aggr(prefix);
break; break;
...@@ -566,13 +571,23 @@ static void print_noise(struct perf_evsel *evsel, double avg) ...@@ -566,13 +571,23 @@ static void print_noise(struct perf_evsel *evsel, double avg)
print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
} }
static void aggr_printout(struct perf_evsel *evsel, int cpu, int nr) static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
{ {
switch (aggr_mode) { switch (aggr_mode) {
case AGGR_CORE:
fprintf(output, "S%d-C%*d%s%*d%s",
cpu_map__id_to_socket(id),
csv_output ? 0 : -8,
cpu_map__id_to_cpu(id),
csv_sep,
csv_output ? 0 : 4,
nr,
csv_sep);
break;
case AGGR_SOCKET: case AGGR_SOCKET:
fprintf(output, "S%*d%s%*d%s", fprintf(output, "S%*d%s%*d%s",
csv_output ? 0 : -5, csv_output ? 0 : -5,
cpu, id,
csv_sep, csv_sep,
csv_output ? 0 : 4, csv_output ? 0 : 4,
nr, nr,
...@@ -581,7 +596,7 @@ static void aggr_printout(struct perf_evsel *evsel, int cpu, int nr) ...@@ -581,7 +596,7 @@ static void aggr_printout(struct perf_evsel *evsel, int cpu, int nr)
case AGGR_NONE: case AGGR_NONE:
fprintf(output, "CPU%*d%s", fprintf(output, "CPU%*d%s",
csv_output ? 0 : -4, csv_output ? 0 : -4,
perf_evsel__cpus(evsel)->map[cpu], csv_sep); perf_evsel__cpus(evsel)->map[id], csv_sep);
break; break;
case AGGR_GLOBAL: case AGGR_GLOBAL:
default: default:
...@@ -1095,6 +1110,7 @@ static void print_stat(int argc, const char **argv) ...@@ -1095,6 +1110,7 @@ static void print_stat(int argc, const char **argv)
} }
switch (aggr_mode) { switch (aggr_mode) {
case AGGR_CORE:
case AGGR_SOCKET: case AGGR_SOCKET:
print_aggr(NULL); print_aggr(NULL);
break; break;
...@@ -1163,6 +1179,13 @@ static int perf_stat_init_aggr_mode(void) ...@@ -1163,6 +1179,13 @@ static int perf_stat_init_aggr_mode(void)
} }
aggr_get_id = cpu_map__get_socket; aggr_get_id = cpu_map__get_socket;
break; break;
case AGGR_CORE:
if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) {
perror("cannot build core map");
return -1;
}
aggr_get_id = cpu_map__get_core;
break;
case AGGR_NONE: case AGGR_NONE:
case AGGR_GLOBAL: case AGGR_GLOBAL:
default: default:
...@@ -1372,6 +1395,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) ...@@ -1372,6 +1395,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
"print counts at regular interval in ms (>= 100)"), "print counts at regular interval in ms (>= 100)"),
OPT_SET_UINT(0, "per-socket", &aggr_mode, OPT_SET_UINT(0, "per-socket", &aggr_mode,
"aggregate counts per processor socket", AGGR_SOCKET), "aggregate counts per processor socket", AGGR_SOCKET),
OPT_SET_UINT(0, "per-core", &aggr_mode,
"aggregate counts per physical processor core", AGGR_CORE),
OPT_END() OPT_END()
}; };
const char * const stat_usage[] = { const char * const stat_usage[] = {
......
...@@ -267,7 +267,53 @@ static int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, ...@@ -267,7 +267,53 @@ static int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
return 0; return 0;
} }
/*
 * cpu_map__get_core - compute a globally unique core identifier for the
 * CPU stored at position @idx of @map.
 *
 * The core id is read from sysfs
 * (<sysfs>/devices/system/cpu/cpuN/topology/core_id) and combined with the
 * socket id returned by cpu_map__get_socket(): the socket goes in the upper
 * 16 bits and the core id in the lower 16 bits. Use cpu_map__id_to_socket()
 * and cpu_map__id_to_cpu() to split the value again.
 *
 * Returns the encoded id, or -1 when @idx is out of range, sysfs is not
 * mounted, the core_id file cannot be opened or parsed, or the socket
 * lookup fails.
 */
int cpu_map__get_core(struct cpu_map *map, int idx)
{
	FILE *fp;
	const char *mnt;
	char path[PATH_MAX];
	int cpu, core, ret, s;

	/*
	 * Valid positions are 0 .. nr - 1 (assuming map->nr is the number of
	 * entries in map->map); reject everything else before indexing.
	 */
	if (idx < 0 || idx >= map->nr)
		return -1;

	cpu = map->map[idx];

	mnt = sysfs_find_mountpoint();
	if (!mnt)
		return -1;

	snprintf(path, PATH_MAX,
		 "%s/devices/system/cpu/cpu%d/topology/core_id",
		 mnt, cpu);

	fp = fopen(path, "r");
	if (!fp)
		return -1;

	/* keep the core id separate from the logical cpu number */
	ret = fscanf(fp, "%d", &core);
	fclose(fp);
	if (ret != 1)
		return -1;

	s = cpu_map__get_socket(map, idx);
	if (s == -1)
		return -1;

	/*
	 * encode socket in upper 16 bits
	 * core_id is relative to socket, and
	 * we need a global id. So we combine
	 * socket + core id
	 */
	return (s << 16) | (core & 0xffff);
}
int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp) int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp)
{ {
return cpu_map__build_map(cpus, sockp, cpu_map__get_socket); return cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
} }
/*
 * cpu_map__build_core_map - build a per-core map from @cpus into @corep.
 *
 * Delegates to cpu_map__build_map(), passing cpu_map__get_core as the
 * callback that yields the id for each entry, and hands back that
 * function's return value unchanged.
 */
int cpu_map__build_core_map(struct cpu_map *cpus, struct cpu_map **corep)
{
	int err;

	err = cpu_map__build_map(cpus, corep, cpu_map__get_core);

	return err;
}
...@@ -15,7 +15,9 @@ void cpu_map__delete(struct cpu_map *map); ...@@ -15,7 +15,9 @@ void cpu_map__delete(struct cpu_map *map);
struct cpu_map *cpu_map__read(FILE *file); struct cpu_map *cpu_map__read(FILE *file);
size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp); size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp);
int cpu_map__get_socket(struct cpu_map *map, int idx); int cpu_map__get_socket(struct cpu_map *map, int idx);
int cpu_map__get_core(struct cpu_map *map, int idx);
int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp); int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp);
int cpu_map__build_core_map(struct cpu_map *cpus, struct cpu_map **corep);
static inline int cpu_map__socket(struct cpu_map *sock, int s) static inline int cpu_map__socket(struct cpu_map *sock, int s)
{ {
...@@ -24,6 +26,16 @@ static inline int cpu_map__socket(struct cpu_map *sock, int s) ...@@ -24,6 +26,16 @@ static inline int cpu_map__socket(struct cpu_map *sock, int s)
return sock->map[s]; return sock->map[s];
} }
/*
 * Extract the socket number from an id produced by cpu_map__get_core(),
 * which stores the socket in the bits above the low 16.
 */
static inline int cpu_map__id_to_socket(int id)
{
	const int socket = id >> 16;

	return socket;
}
/*
 * Extract the low 16 bits of an id; for ids produced by
 * cpu_map__get_core() this is the socket-relative core id.
 */
static inline int cpu_map__id_to_cpu(int id)
{
	const int low = id & 0xffff;

	return low;
}
static inline int cpu_map__nr(const struct cpu_map *map) static inline int cpu_map__nr(const struct cpu_map *map)
{ {
return map ? map->nr : 1; return map ? map->nr : 1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment