#include "builtin.h"
#include "perf.h"

#include "util/util.h"
#include "util/evlist.h"
#include "util/cache.h"
#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"

#include "util/parse-options.h"
#include "util/trace-event.h"

#include "util/debug.h"

#include <sys/prctl.h>
#include <sys/resource.h>

#include <semaphore.h>
#include <pthread.h>
#include <math.h>

#define PR_SET_NAME		15               /* Set process name */
#define MAX_CPUS		4096
#define COMM_LEN		20
#define SYM_LEN			129
#define MAX_PID			65536
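
/*
 * Replay data model: every traced task becomes a task_desc whose
 * history is an ordered array of sched_atoms (run / sleep / wakeup /
 * migration).  'perf sched replay' later re-executes those atoms with
 * real threads and semaphores.
 */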

struct sched_atom;

struct task_desc {
	unsigned long		nr;
	unsigned long		pid;
	char			comm[COMM_LEN];

	unsigned long		nr_events;
	unsigned long		curr_event;
	struct sched_atom	**atoms;

	pthread_t		thread;
	sem_t			sleep_sem;

	sem_t			ready_for_work;
	sem_t			work_done_sem;

	u64			cpu_usage;
};

enum sched_event_type {
	SCHED_EVENT_RUN,
	SCHED_EVENT_SLEEP,
	SCHED_EVENT_WAKEUP,
	SCHED_EVENT_MIGRATION,
};

struct sched_atom {
	enum sched_event_type	type;
	int			specific_wait;
	u64			timestamp;
	u64			duration;
	unsigned long		nr;
	sem_t			*wait_sem;
	struct task_desc	*wakee;
};

#define TASK_STATE_TO_CHAR_STR "RSDTtZX"

enum thread_state {
	THREAD_SLEEPING = 0,
	THREAD_WAIT_CPU,
	THREAD_SCHED_IN,
	THREAD_IGNORE
};
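
/*
 * One work_atom tracks a single scheduling interval of a thread: when
 * it was scheduled out, woken up and scheduled back in, plus the
 * runtime accumulated in between.  Per-thread atoms are collected in
 * a work_atoms node kept in an rb-tree (see struct work_atoms below).
 */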

struct work_atom {
	struct list_head	list;
	enum thread_state	state;
81
	u64			sched_out_time;
82 83 84 85 86
	u64			wake_up_time;
	u64			sched_in_time;
	u64			runtime;
};

struct work_atoms {
	struct list_head	work_list;
	struct thread		*thread;
	struct rb_node		node;
	u64			max_lat;
	u64			max_lat_at;
	u64			total_lat;
	u64			nb_atoms;
	u64			total_runtime;
};

typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct perf_sched;

struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	int (*fork_event)(struct perf_sched *sched, struct perf_evsel *evsel,
			  struct perf_sample *sample);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct perf_evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};

struct perf_sched {
	struct perf_tool tool;
	const char	 *input_name;
	const char	 *sort_order;
	unsigned long	 nr_tasks;
	struct task_desc *pid_to_task[MAX_PID];
	struct task_desc **tasks;
	const struct trace_sched_handler *tp_handler;
	pthread_mutex_t	 start_work_mutex;
	pthread_mutex_t	 work_done_wait_mutex;
	int		 profile_cpu;
/*
 * Track the current task - that way we can know whether there are any
 * weird events, such as a task being switched away that is not current.
 */
	int		 max_cpu;
	u32		 curr_pid[MAX_CPUS];
	struct thread	 *curr_thread[MAX_CPUS];
	char		 next_shortname1;
	char		 next_shortname2;
	unsigned int	 replay_repeat;
	unsigned long	 nr_run_events;
	unsigned long	 nr_sleep_events;
	unsigned long	 nr_wakeup_events;
	unsigned long	 nr_sleep_corrections;
	unsigned long	 nr_run_events_optimized;
	unsigned long	 targetless_wakeups;
	unsigned long	 multitarget_wakeups;
	unsigned long	 nr_runs;
	unsigned long	 nr_timestamps;
	unsigned long	 nr_unordered_timestamps;
	unsigned long	 nr_state_machine_bugs;
	unsigned long	 nr_context_switch_bugs;
	unsigned long	 nr_events;
	unsigned long	 nr_lost_chunks;
	unsigned long	 nr_lost_events;
	u64		 run_measurement_overhead;
	u64		 sleep_measurement_overhead;
	u64		 start_time;
	u64		 cpu_usage;
	u64		 runavg_cpu_usage;
	u64		 parent_cpu_usage;
	u64		 runavg_parent_cpu_usage;
	u64		 sum_runtime;
	u64		 sum_fluct;
	u64		 run_avg;
	u64		 all_runtime;
	u64		 all_count;
	u64		 cpu_last_switched[MAX_CPUS];
	struct rb_root	 atom_root, sorted_atom_root;
	struct list_head sort_list, cmp_pid;
};

static u64 get_nsecs(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);

	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}
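
/*
 * burn_nsecs() spins for a given duration and is what replay uses to
 * reproduce a task's traced runtime.  The spin loop itself has a
 * measurable cost, so calibrate_run_measurement_overhead() determines
 * it up front and burn_nsecs() stops that much early; the nanosleep()
 * overhead is measured as well and reported for reference.
 */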

static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
	u64 T0 = get_nsecs(), T1;

	do {
		T1 = get_nsecs();
	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}

static void sleep_nsecs(u64 nsecs)
{
	struct timespec ts;

	ts.tv_nsec = nsecs % 999999999;
	ts.tv_sec = nsecs / 999999999;

	nanosleep(&ts, NULL);
}

static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		burn_nsecs(sched, 0);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	sched->run_measurement_overhead = min_delta;

	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
	u64 T0, T1, delta, min_delta = 1000000000ULL;
	int i;

	for (i = 0; i < 10; i++) {
		T0 = get_nsecs();
		sleep_nsecs(10000);
		T1 = get_nsecs();
		delta = T1-T0;
		min_delta = min(min_delta, delta);
	}
	min_delta -= 10000;
	sched->sleep_measurement_overhead = min_delta;

	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}

static struct sched_atom *
get_new_event(struct task_desc *task, u64 timestamp)
{
	struct sched_atom *event = zalloc(sizeof(*event));
	unsigned long idx = task->nr_events;
	size_t size;

	event->timestamp = timestamp;
	event->nr = idx;

	task->nr_events++;
	size = sizeof(struct sched_atom *) * task->nr_events;
	task->atoms = realloc(task->atoms, size);
	BUG_ON(!task->atoms);

	task->atoms[idx] = event;

	return event;
}

static struct sched_atom *last_event(struct task_desc *task)
{
	if (!task->nr_events)
		return NULL;

	return task->atoms[task->nr_events - 1];
}

static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
				u64 timestamp, u64 duration)
{
	struct sched_atom *event, *curr_event = last_event(task);

	/*
	 * optimize an existing RUN event by merging this one
	 * to it:
	 */
	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
		sched->nr_run_events_optimized++;
		curr_event->duration += duration;
		return;
	}

	event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_RUN;
	event->duration = duration;

	sched->nr_run_events++;
}

static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
				   u64 timestamp, struct task_desc *wakee)
{
	struct sched_atom *event, *wakee_event;

	event = get_new_event(task, timestamp);
	event->type = SCHED_EVENT_WAKEUP;
	event->wakee = wakee;

	wakee_event = last_event(wakee);
	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
		sched->targetless_wakeups++;
		return;
	}
	if (wakee_event->wait_sem) {
		sched->multitarget_wakeups++;
		return;
	}

	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
	sem_init(wakee_event->wait_sem, 0, 0);
	wakee_event->specific_wait = 1;
	event->wait_sem = wakee_event->wait_sem;

	sched->nr_wakeup_events++;
}

static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
				  u64 timestamp, u64 task_state __maybe_unused)
{
	struct sched_atom *event = get_new_event(task, timestamp);

	event->type = SCHED_EVENT_SLEEP;

	sched->nr_sleep_events++;
}

static struct task_desc *register_pid(struct perf_sched *sched,
				      unsigned long pid, const char *comm)
{
	struct task_desc *task;

	BUG_ON(pid >= MAX_PID);

	task = sched->pid_to_task[pid];

	if (task)
		return task;

	task = zalloc(sizeof(*task));
	task->pid = pid;
	task->nr = sched->nr_tasks;
	strcpy(task->comm, comm);
	/*
	 * every task starts in sleeping state - this gets ignored
	 * if there's no wakeup pointing to this sleep state:
	 */
	add_sched_event_sleep(sched, task, 0, 0);

	sched->pid_to_task[pid] = task;
	sched->nr_tasks++;
	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
	BUG_ON(!sched->tasks);
	sched->tasks[task->nr] = task;

	if (verbose)
		printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);

	return task;
}


static void print_task_traces(struct perf_sched *sched)
{
	struct task_desc *task;
	unsigned long i;

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
			task->nr, task->comm, task->pid, task->nr_events);
	}
}

static void add_cross_task_wakeups(struct perf_sched *sched)
{
	struct task_desc *task1, *task2;
	unsigned long i, j;

	for (i = 0; i < sched->nr_tasks; i++) {
		task1 = sched->tasks[i];
		j = i + 1;
		if (j == sched->nr_tasks)
			j = 0;
		task2 = sched->tasks[j];
		add_sched_event_wakeup(sched, task1, 0, task2);
	}
}

static void perf_sched__process_event(struct perf_sched *sched,
				      struct sched_atom *atom)
{
	int ret = 0;

	switch (atom->type) {
		case SCHED_EVENT_RUN:
			burn_nsecs(sched, atom->duration);
			break;
		case SCHED_EVENT_SLEEP:
			if (atom->wait_sem)
				ret = sem_wait(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_WAKEUP:
			if (atom->wait_sem)
				ret = sem_post(atom->wait_sem);
			BUG_ON(ret);
			break;
		case SCHED_EVENT_MIGRATION:
			break;
		default:
			BUG_ON(1);
	}
}

static u64 get_cpu_usage_nsec_parent(void)
{
	struct rusage ru;
	u64 sum;
	int err;

	err = getrusage(RUSAGE_SELF, &ru);
	BUG_ON(err);

	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;

	return sum;
}

static int self_open_counters(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));

	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);

	if (fd < 0)
		pr_err("Error: sys_perf_event_open() syscall returned "
		       "with %d (%s)\n", fd, strerror(errno));
	return fd;
}

static u64 get_cpu_usage_nsec_self(int fd)
{
	u64 runtime;
	int ret;

	ret = read(fd, &runtime, sizeof(runtime));
	BUG_ON(ret != sizeof(runtime));

	return runtime;
}

struct sched_thread_parms {
	struct task_desc  *task;
	struct perf_sched *sched;
};
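
/*
 * Body of each replay worker thread.  create_tasks() starts the
 * workers with both mutexes held by the parent: a worker posts
 * ready_for_work, then blocks acquiring start_work_mutex until
 * wait_for_tasks() releases it.  It replays its atoms while measuring
 * its own task-clock with the counter from self_open_counters(),
 * posts work_done_sem, and parks on work_done_wait_mutex before
 * looping for the next run.
 */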

static void *thread_func(void *ctx)
{
	struct sched_thread_parms *parms = ctx;
	struct task_desc *this_task = parms->task;
	struct perf_sched *sched = parms->sched;
	u64 cpu_usage_0, cpu_usage_1;
	unsigned long i, ret;
	char comm2[22];
	int fd;

	free(parms);

	sprintf(comm2, ":%s", this_task->comm);
	prctl(PR_SET_NAME, comm2);
	fd = self_open_counters();
	if (fd < 0)
		return NULL;
again:
	ret = sem_post(&this_task->ready_for_work);
	BUG_ON(ret);
	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->start_work_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_self(fd);

	for (i = 0; i < this_task->nr_events; i++) {
		this_task->curr_event = i;
		perf_sched__process_event(sched, this_task->atoms[i]);
	}

	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
	ret = sem_post(&this_task->work_done_sem);
	BUG_ON(ret);

	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);
	ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	goto again;
}

static void create_tasks(struct perf_sched *sched)
{
	struct task_desc *task;
	pthread_attr_t attr;
	unsigned long i;
	int err;

	err = pthread_attr_init(&attr);
	BUG_ON(err);
	err = pthread_attr_setstacksize(&attr,
			(size_t) max(16 * 1024, PTHREAD_STACK_MIN));
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(err);
	err = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(err);
	for (i = 0; i < sched->nr_tasks; i++) {
		struct sched_thread_parms *parms = malloc(sizeof(*parms));
		BUG_ON(parms == NULL);
		parms->task = task = sched->tasks[i];
		parms->sched = sched;
		sem_init(&task->sleep_sem, 0, 0);
		sem_init(&task->ready_for_work, 0, 0);
		sem_init(&task->work_done_sem, 0, 0);
		task->curr_event = 0;
		err = pthread_create(&task->thread, &attr, thread_func, parms);
		BUG_ON(err);
	}
}
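
/*
 * One measurement pass: release start_work_mutex so the workers run,
 * collect their task-clock usage, then re-acquire it for the next
 * round.  The runavg_* fields (and run_avg in run_one_test()) are
 * exponential moving averages updated as avg = (avg * 9 + sample) / 10,
 * so each run contributes 10% to the reported average.
 */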

static void wait_for_tasks(struct perf_sched *sched)
{
	u64 cpu_usage_0, cpu_usage_1;
	struct task_desc *task;
	unsigned long i, ret;

	sched->start_time = get_nsecs();
	sched->cpu_usage = 0;
	pthread_mutex_unlock(&sched->work_done_wait_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->ready_for_work);
		BUG_ON(ret);
		sem_init(&task->ready_for_work, 0, 0);
	}
	ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
	BUG_ON(ret);

	cpu_usage_0 = get_cpu_usage_nsec_parent();

	pthread_mutex_unlock(&sched->start_work_mutex);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		ret = sem_wait(&task->work_done_sem);
		BUG_ON(ret);
		sem_init(&task->work_done_sem, 0, 0);
		sched->cpu_usage += task->cpu_usage;
		task->cpu_usage = 0;
	}

	cpu_usage_1 = get_cpu_usage_nsec_parent();
	if (!sched->runavg_cpu_usage)
		sched->runavg_cpu_usage = sched->cpu_usage;
	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;

	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
	if (!sched->runavg_parent_cpu_usage)
		sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
					 sched->parent_cpu_usage)/10;

	ret = pthread_mutex_lock(&sched->start_work_mutex);
	BUG_ON(ret);

	for (i = 0; i < sched->nr_tasks; i++) {
		task = sched->tasks[i];
		sem_init(&task->sleep_sem, 0, 0);
		task->curr_event = 0;
	}
}

static void run_one_test(struct perf_sched *sched)
{
	u64 T0, T1, delta, avg_delta, fluct;

	T0 = get_nsecs();
	wait_for_tasks(sched);
	T1 = get_nsecs();

	delta = T1 - T0;
	sched->sum_runtime += delta;
	sched->nr_runs++;

	avg_delta = sched->sum_runtime / sched->nr_runs;
	if (delta < avg_delta)
		fluct = avg_delta - delta;
	else
		fluct = delta - avg_delta;
	sched->sum_fluct += fluct;
	if (!sched->run_avg)
		sched->run_avg = delta;
	sched->run_avg = (sched->run_avg * 9 + delta) / 10;

	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);

	printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);

	printf("cpu: %0.2f / %0.2f",
		(double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);

#if 0
	/*
	 * rusage statistics done by the parent, these are less
	 * accurate than the sched->sum_exec_runtime based statistics:
	 */
	printf(" [%0.2f / %0.2f]",
		(double)sched->parent_cpu_usage/1e6,
		(double)sched->runavg_parent_cpu_usage/1e6);
#endif

	printf("\n");

	if (sched->nr_sleep_corrections)
		printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
	sched->nr_sleep_corrections = 0;
}

static void test_calibrations(struct perf_sched *sched)
{
	u64 T0, T1;

	T0 = get_nsecs();
	burn_nsecs(sched, 1e6);
	T1 = get_nsecs();

	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);

	T0 = get_nsecs();
	sleep_nsecs(1e6);
	T1 = get_nsecs();

	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}

653
static int
654
replay_wakeup_event(struct perf_sched *sched,
655 656
		    struct perf_evsel *evsel, struct perf_sample *sample,
		    struct machine *machine __maybe_unused)
657
{
658 659
	const char *comm = perf_evsel__strval(evsel, sample, "comm");
	const u32 pid	 = perf_evsel__intval(evsel, sample, "pid");
660
	struct task_desc *waker, *wakee;
661

662
	if (verbose) {
663
		printf("sched_wakeup event %p\n", evsel);
664

665
		printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
666
	}
667

668
	waker = register_pid(sched, sample->tid, "<unknown>");
669
	wakee = register_pid(sched, pid, comm);
670

671
	add_sched_event_wakeup(sched, waker, sample->time, wakee);
672
	return 0;
673 674
}

675 676 677 678
static int replay_switch_event(struct perf_sched *sched,
			       struct perf_evsel *evsel,
			       struct perf_sample *sample,
			       struct machine *machine __maybe_unused)
679
{
680 681 682 683 684
	const char *prev_comm  = perf_evsel__strval(evsel, sample, "prev_comm"),
		   *next_comm  = perf_evsel__strval(evsel, sample, "next_comm");
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
685
	struct task_desc *prev, __maybe_unused *next;
686 687
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
688 689
	s64 delta;

690
	if (verbose)
691
		printf("sched_switch event %p\n", evsel);
692

693
	if (cpu >= MAX_CPUS || cpu < 0)
694
		return 0;
695

696
	timestamp0 = sched->cpu_last_switched[cpu];
697 698 699 700 701
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

702
	if (delta < 0) {
703
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
704 705
		return -1;
	}
706

707 708
	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
		 prev_comm, prev_pid, next_comm, next_pid, delta);
709

710 711
	prev = register_pid(sched, prev_pid, prev_comm);
	next = register_pid(sched, next_pid, next_comm);
712

713
	sched->cpu_last_switched[cpu] = timestamp;
714

715
	add_sched_event_run(sched, prev, timestamp, delta);
716
	add_sched_event_sleep(sched, prev, timestamp, prev_state);
717 718

	return 0;
719 720
}

721 722
static int replay_fork_event(struct perf_sched *sched, struct perf_evsel *evsel,
			     struct perf_sample *sample)
723
{
724 725 726 727 728
	const char *parent_comm = perf_evsel__strval(evsel, sample, "parent_comm"),
		   *child_comm  = perf_evsel__strval(evsel, sample, "child_comm");
	const u32 parent_pid  = perf_evsel__intval(evsel, sample, "parent_pid"),
		  child_pid  = perf_evsel__intval(evsel, sample, "child_pid");

729
	if (verbose) {
730
		printf("sched_fork event %p\n", evsel);
731 732
		printf("... parent: %s/%d\n", parent_comm, parent_pid);
		printf("...  child: %s/%d\n", child_comm, child_pid);
733
	}
734 735 736

	register_pid(sched, parent_pid, parent_comm);
	register_pid(sched, child_pid, child_comm);
737
	return 0;
738
}
739

740 741
struct sort_dimension {
	const char		*name;
742
	sort_fn_t		cmp;
743 744 745
	struct list_head	list;
};
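
/*
 * Latency sorting: the --sort keys are collected as sort_dimension
 * entries on a list; thread_lat_cmp() applies them in order until one
 * comparison differs.  atom_root is keyed by pid (cmp_pid), while
 * sorted_atom_root is rebuilt with the user's sort keys (sort_list).
 */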

746
static int
747
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
748 749 750 751
{
	struct sort_dimension *sort;
	int ret = 0;

752 753
	BUG_ON(list_empty(list));

754 755 756 757 758 759 760 761 762
	list_for_each_entry(sort, list, list) {
		ret = sort->cmp(l, r);
		if (ret)
			return ret;
	}

	return ret;
}

763
static struct work_atoms *
764 765 766 767
thread_atoms_search(struct rb_root *root, struct thread *thread,
			 struct list_head *sort_list)
{
	struct rb_node *node = root->rb_node;
768
	struct work_atoms key = { .thread = thread };
769 770

	while (node) {
771
		struct work_atoms *atoms;
772 773
		int cmp;

774
		atoms = container_of(node, struct work_atoms, node);
775 776 777 778 779 780 781 782 783 784 785 786 787 788

		cmp = thread_lat_cmp(sort_list, &key, atoms);
		if (cmp > 0)
			node = node->rb_left;
		else if (cmp < 0)
			node = node->rb_right;
		else {
			BUG_ON(thread != atoms->thread);
			return atoms;
		}
	}
	return NULL;
}

789
static void
790
__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
791
			 struct list_head *sort_list)
792 793 794 795
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	while (*new) {
796
		struct work_atoms *this;
797
		int cmp;
798

799
		this = container_of(*new, struct work_atoms, node);
800
		parent = *new;
801 802 803 804

		cmp = thread_lat_cmp(sort_list, data, this);

		if (cmp > 0)
805 806
			new = &((*new)->rb_left);
		else
807
			new = &((*new)->rb_right);
808 809 810 811 812 813
	}

	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
}

814
static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
815
{
816
	struct work_atoms *atoms = zalloc(sizeof(*atoms));
817 818 819 820
	if (!atoms) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}
821

822
	atoms->thread = thread;
823
	INIT_LIST_HEAD(&atoms->work_list);
824
	__thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
825
	return 0;
826 827
}

828
static int latency_fork_event(struct perf_sched *sched __maybe_unused,
829 830
			      struct perf_evsel *evsel __maybe_unused,
			      struct perf_sample *sample __maybe_unused)
831 832
{
	/* should insert the newcomer */
833
	return 0;
834 835
}

836
static char sched_out_state(u64 prev_state)
837 838 839
{
	const char *str = TASK_STATE_TO_CHAR_STR;

840
	return str[prev_state];
841 842
}
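
/*
 * A work_atom is opened by add_sched_out_event() when a task leaves
 * the CPU and closed by add_sched_in_event() when it gets back on.
 * The scheduling delay charged to the task is
 * sched_in_time - wake_up_time; for a task scheduled out in state 'R'
 * the wakeup time is the sched-out time itself.
 */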

843
static int
844 845 846
add_sched_out_event(struct work_atoms *atoms,
		    char run_state,
		    u64 timestamp)
847
{
848
	struct work_atom *atom = zalloc(sizeof(*atom));
849 850 851 852
	if (!atom) {
		pr_err("No memory at %s\n", __func__);
		return -1;
	}
853

854 855
	atom->sched_out_time = timestamp;

856
	if (run_state == 'R') {
857
		atom->state = THREAD_WAIT_CPU;
858
		atom->wake_up_time = atom->sched_out_time;
859 860
	}

861
	list_add_tail(&atom->list, &atoms->work_list);
862
	return 0;
863 864 865
}

static void
866 867
add_runtime_event(struct work_atoms *atoms, u64 delta,
		  u64 timestamp __maybe_unused)
868 869 870 871 872 873 874 875 876 877 878 879 880
{
	struct work_atom *atom;

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);

	atom->runtime += delta;
	atoms->total_runtime += delta;
}

static void
add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
881
{
882
	struct work_atom *atom;
883
	u64 delta;
884

885
	if (list_empty(&atoms->work_list))
886 887
		return;

888
	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
889

890
	if (atom->state != THREAD_WAIT_CPU)
891 892
		return;

893 894
	if (timestamp < atom->wake_up_time) {
		atom->state = THREAD_IGNORE;
895 896 897
		return;
	}

898 899
	atom->state = THREAD_SCHED_IN;
	atom->sched_in_time = timestamp;
900

901
	delta = atom->sched_in_time - atom->wake_up_time;
902
	atoms->total_lat += delta;
903
	if (delta > atoms->max_lat) {
904
		atoms->max_lat = delta;
905 906
		atoms->max_lat_at = timestamp;
	}
907
	atoms->nb_atoms++;
908 909
}

910 911 912 913
static int latency_switch_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
914
{
915 916 917
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
	const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
918
	struct work_atoms *out_events, *in_events;
919
	struct thread *sched_out, *sched_in;
920 921
	u64 timestamp0, timestamp = sample->time;
	int cpu = sample->cpu;
	s64 delta;

924
	BUG_ON(cpu >= MAX_CPUS || cpu < 0);

926 927
	timestamp0 = sched->cpu_last_switched[cpu];
	sched->cpu_last_switched[cpu] = timestamp;
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

933 934 935 936
	if (delta < 0) {
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
		return -1;
	}
937

938 939
	sched_out = machine__findnew_thread(machine, prev_pid);
	sched_in = machine__findnew_thread(machine, next_pid);
940

941
	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
942
	if (!out_events) {
943
		if (thread_atoms_insert(sched, sched_out))
944
			return -1;
945
		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
946 947 948 949
		if (!out_events) {
			pr_err("out-event: Internal tree error");
			return -1;
		}
950
	}
951
	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
952
		return -1;
953

954
	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
955
	if (!in_events) {
956
		if (thread_atoms_insert(sched, sched_in))
957
			return -1;
958
		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
959 960 961 962
		if (!in_events) {
			pr_err("in-event: Internal tree error");
			return -1;
		}
963 964 965 966
		/*
		 * Task came in we have not heard about yet,
		 * add in an initial atom in runnable state:
		 */
967 968
		if (add_sched_out_event(in_events, 'R', timestamp))
			return -1;
969
	}
970
	add_sched_in_event(in_events, timestamp);
971 972

	return 0;
973
}
974

975 976 977 978
static int latency_runtime_event(struct perf_sched *sched,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample,
				 struct machine *machine)
979
{
980 981 982
	const u32 pid	   = perf_evsel__intval(evsel, sample, "pid");
	const u64 runtime  = perf_evsel__intval(evsel, sample, "runtime");
	struct thread *thread = machine__findnew_thread(machine, pid);
983
	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
984 985
	u64 timestamp = sample->time;
	int cpu = sample->cpu;
986 987 988

	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
	if (!atoms) {
989
		if (thread_atoms_insert(sched, thread))
990
			return -1;
991
		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
992
		if (!atoms) {
993
			pr_err("in-event: Internal tree error");
994 995 996 997
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
998 999
	}

1000
	add_runtime_event(atoms, runtime, timestamp);
1001
	return 0;
1002 1003
}

1004 1005 1006 1007
static int latency_wakeup_event(struct perf_sched *sched,
				struct perf_evsel *evsel,
				struct perf_sample *sample,
				struct machine *machine)
1008
{
1009 1010
	const u32 pid	  = perf_evsel__intval(evsel, sample, "pid"),
		  success = perf_evsel__intval(evsel, sample, "success");
1011
	struct work_atoms *atoms;
1012
	struct work_atom *atom;
1013
	struct thread *wakee;
1014
	u64 timestamp = sample->time;
1015 1016

	/* Note for later, it may be interesting to observe the failing cases */
1017
	if (!success)
1018
		return 0;
1019

1020
	wakee = machine__findnew_thread(machine, pid);
1021
	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1022
	if (!atoms) {
1023
		if (thread_atoms_insert(sched, wakee))
1024
			return -1;
1025
		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1026
		if (!atoms) {
1027
			pr_err("wakeup-event: Internal tree error");
1028 1029 1030 1031
			return -1;
		}
		if (add_sched_out_event(atoms, 'S', timestamp))
			return -1;
1032 1033
	}

1034
	BUG_ON(list_empty(&atoms->work_list));
1035

1036
	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1037

1038 1039 1040 1041 1042
	/*
	 * You WILL be missing events if you've recorded only
	 * one CPU, or are only looking at only one, so don't
	 * make useless noise.
	 */
1043 1044
	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
		sched->nr_state_machine_bugs++;
1045

1046
	sched->nr_timestamps++;
1047
	if (atom->sched_out_time > timestamp) {
1048
		sched->nr_unordered_timestamps++;
1049
		return 0;
1050
	}
1051

1052 1053
	atom->state = THREAD_WAIT_CPU;
	atom->wake_up_time = timestamp;
1054
	return 0;
1055 1056
}

1057 1058 1059 1060
static int latency_migrate_task_event(struct perf_sched *sched,
				      struct perf_evsel *evsel,
				      struct perf_sample *sample,
				      struct machine *machine)
1061
{
1062
	const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1063
	u64 timestamp = sample->time;
1064 1065 1066 1067 1068 1069 1070
	struct work_atoms *atoms;
	struct work_atom *atom;
	struct thread *migrant;

	/*
	 * Only need to worry about migration when profiling one CPU.
	 */
1071
	if (sched->profile_cpu == -1)
1072
		return 0;
1073

1074
	migrant = machine__findnew_thread(machine, pid);
1075
	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1076
	if (!atoms) {
1077
		if (thread_atoms_insert(sched, migrant))
1078
			return -1;
1079 1080
		register_pid(sched, migrant->pid, migrant->comm);
		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1081
		if (!atoms) {
1082
			pr_err("migration-event: Internal tree error");
1083 1084 1085 1086
			return -1;
		}
		if (add_sched_out_event(atoms, 'R', timestamp))
			return -1;
1087 1088 1089 1090 1091 1092 1093
	}

	BUG_ON(list_empty(&atoms->work_list));

	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;

1094
	sched->nr_timestamps++;
1095 1096

	if (atom->sched_out_time > timestamp)
1097
		sched->nr_unordered_timestamps++;
1098 1099

	return 0;
1100 1101
}

1102
static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1103 1104 1105
{
	int i;
	int ret;
1106
	u64 avg;
1107

1108
	if (!work_list->nb_atoms)
1109
		return;
1110 1111 1112
	/*
	 * Ignore idle threads:
	 */
1113
	if (!strcmp(work_list->thread->comm, "swapper"))
1114
		return;
1115

1116 1117
	sched->all_runtime += work_list->total_runtime;
	sched->all_count   += work_list->nb_atoms;
1118

1119
	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->pid);
1120

	for (i = 0; i < 24 - ret; i++)
1122 1123
		printf(" ");

1124
	avg = work_list->total_lat / work_list->nb_atoms;
1125

1126
	printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
1127 1128
	      (double)work_list->total_runtime / 1e6,
		 work_list->nb_atoms, (double)avg / 1e6,
1129 1130
		 (double)work_list->max_lat / 1e6,
		 (double)work_list->max_lat_at / 1e9);
1131 1132
}

1133
static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1134 1135 1136 1137 1138 1139 1140 1141 1142
{
	if (l->thread->pid < r->thread->pid)
		return -1;
	if (l->thread->pid > r->thread->pid)
		return 1;

	return 0;
}

1143
static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163
{
	u64 avgl, avgr;

	if (!l->nb_atoms)
		return -1;

	if (!r->nb_atoms)
		return 1;

	avgl = l->total_lat / l->nb_atoms;
	avgr = r->total_lat / r->nb_atoms;

	if (avgl < avgr)
		return -1;
	if (avgl > avgr)
		return 1;

	return 0;
}

1164
static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1165 1166 1167 1168 1169 1170 1171 1172 1173
{
	if (l->max_lat < r->max_lat)
		return -1;
	if (l->max_lat > r->max_lat)
		return 1;

	return 0;
}

1174
static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1175 1176 1177 1178 1179 1180 1181 1182 1183
{
	if (l->nb_atoms < r->nb_atoms)
		return -1;
	if (l->nb_atoms > r->nb_atoms)
		return 1;

	return 0;
}

1184
static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1185 1186 1187 1188 1189 1190 1191 1192 1193
{
	if (l->total_runtime < r->total_runtime)
		return -1;
	if (l->total_runtime > r->total_runtime)
		return 1;

	return 0;
}

1194
static int sort_dimension__add(const char *tok, struct list_head *list)
1195
{
1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223
	size_t i;
	static struct sort_dimension avg_sort_dimension = {
		.name = "avg",
		.cmp  = avg_cmp,
	};
	static struct sort_dimension max_sort_dimension = {
		.name = "max",
		.cmp  = max_cmp,
	};
	static struct sort_dimension pid_sort_dimension = {
		.name = "pid",
		.cmp  = pid_cmp,
	};
	static struct sort_dimension runtime_sort_dimension = {
		.name = "runtime",
		.cmp  = runtime_cmp,
	};
	static struct sort_dimension switch_sort_dimension = {
		.name = "switch",
		.cmp  = switch_cmp,
	};
	struct sort_dimension *available_sorts[] = {
		&pid_sort_dimension,
		&avg_sort_dimension,
		&max_sort_dimension,
		&switch_sort_dimension,
		&runtime_sort_dimension,
	};
1224

1225
	for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1226 1227 1228 1229 1230 1231 1232 1233 1234 1235
		if (!strcmp(available_sorts[i]->name, tok)) {
			list_add_tail(&available_sorts[i]->list, list);

			return 0;
		}
	}

	return -1;
}

1236
static void perf_sched__sort_lat(struct perf_sched *sched)
1237 1238 1239 1240
{
	struct rb_node *node;

	for (;;) {
1241
		struct work_atoms *data;
1242
		node = rb_first(&sched->atom_root);
1243 1244 1245
		if (!node)
			break;

1246
		rb_erase(node, &sched->atom_root);
1247
		data = rb_entry(node, struct work_atoms, node);
1248
		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1249 1250 1251
	}
}

1252
static int process_sched_wakeup_event(struct perf_tool *tool,
1253
				      struct perf_evsel *evsel,
1254
				      struct perf_sample *sample,
1255
				      struct machine *machine)
1256
{
1257
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1258

1259 1260
	if (sched->tp_handler->wakeup_event)
		return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1261

1262
	return 0;
1263 1264
}
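
/*
 * 'perf sched map' output: every task that shows up is assigned a
 * two-character shortname (A0..Z9); map_switch_event() prints one
 * column per CPU, marking the CPU that switched with '*', followed by
 * the timestamp and a shortname legend line for newly seen tasks.
 */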

1265 1266
static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
			    struct perf_sample *sample, struct machine *machine)
1267
{
1268 1269
	const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
		  next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1270
	struct thread *sched_out __maybe_unused, *sched_in;
1271
	int new_shortname;
1272
	u64 timestamp0, timestamp = sample->time;
1273
	s64 delta;
1274
	int cpu, this_cpu = sample->cpu;
1275 1276 1277

	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);

1278 1279
	if (this_cpu > sched->max_cpu)
		sched->max_cpu = this_cpu;
1280

1281 1282
	timestamp0 = sched->cpu_last_switched[this_cpu];
	sched->cpu_last_switched[this_cpu] = timestamp;
1283 1284 1285 1286 1287
	if (timestamp0)
		delta = timestamp - timestamp0;
	else
		delta = 0;

1288
	if (delta < 0) {
1289
		pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1290 1291
		return -1;
	}
1292

1293 1294
	sched_out = machine__findnew_thread(machine, prev_pid);
	sched_in = machine__findnew_thread(machine, next_pid);
1295

1296
	sched->curr_thread[this_cpu] = sched_in;
1297 1298 1299 1300 1301

	printf("  ");

	new_shortname = 0;
	if (!sched_in->shortname[0]) {
1302 1303
		sched_in->shortname[0] = sched->next_shortname1;
		sched_in->shortname[1] = sched->next_shortname2;
1304

1305 1306
		if (sched->next_shortname1 < 'Z') {
			sched->next_shortname1++;
1307
		} else {
1308 1309 1310
			sched->next_shortname1='A';
			if (sched->next_shortname2 < '9') {
				sched->next_shortname2++;
1311
			} else {
1312
				sched->next_shortname2='0';
1313 1314 1315 1316 1317
			}
		}
		new_shortname = 1;
	}

1318
	for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
1319 1320 1321 1322 1323
		if (cpu != this_cpu)
			printf(" ");
		else
			printf("*");

1324 1325 1326
		if (sched->curr_thread[cpu]) {
			if (sched->curr_thread[cpu]->pid)
				printf("%2s ", sched->curr_thread[cpu]->shortname);
1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339
			else
				printf(".  ");
		} else
			printf("   ");
	}

	printf("  %12.6f secs ", (double)timestamp/1e9);
	if (new_shortname) {
		printf("%s => %s:%d\n",
			sched_in->shortname, sched_in->comm, sched_in->pid);
	} else {
		printf("\n");
	}
1340 1341

	return 0;
1342 1343
}

1344
static int process_sched_switch_event(struct perf_tool *tool,
1345
				      struct perf_evsel *evsel,
1346
				      struct perf_sample *sample,
1347
				      struct machine *machine)
1348
{
1349
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1350
	int this_cpu = sample->cpu, err = 0;
1351 1352
	u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
	    next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1353

1354
	if (sched->curr_pid[this_cpu] != (u32)-1) {
1355 1356 1357 1358
		/*
		 * Are we trying to switch away a PID that is
		 * not current?
		 */
1359
		if (sched->curr_pid[this_cpu] != prev_pid)
1360
			sched->nr_context_switch_bugs++;
1361 1362
	}

1363 1364
	if (sched->tp_handler->switch_event)
		err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1365 1366

	sched->curr_pid[this_cpu] = next_pid;
1367
	return err;
1368 1369
}

1370
static int process_sched_runtime_event(struct perf_tool *tool,
1371
				       struct perf_evsel *evsel,
1372
				       struct perf_sample *sample,
1373
				       struct machine *machine)
1374
{
1375
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1376

1377 1378
	if (sched->tp_handler->runtime_event)
		return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1379

1380
	return 0;
1381 1382
}

1383
static int process_sched_fork_event(struct perf_tool *tool,
1384
				    struct perf_evsel *evsel,
1385
				    struct perf_sample *sample,
1386
				    struct machine *machine __maybe_unused)
1387
{
1388
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1389

1390 1391
	if (sched->tp_handler->fork_event)
		return sched->tp_handler->fork_event(sched, evsel, sample);
1392

1393
	return 0;
1394 1395
}

1396
static int process_sched_exit_event(struct perf_tool *tool __maybe_unused,
1397
				    struct perf_evsel *evsel,
1398
				    struct perf_sample *sample __maybe_unused,
1399
				    struct machine *machine __maybe_unused)
1400
{
1401
	pr_debug("sched_exit event %p\n", evsel);
1402
	return 0;
1403 1404
}

1405
static int process_sched_migrate_task_event(struct perf_tool *tool,
1406
					    struct perf_evsel *evsel,
1407
					    struct perf_sample *sample,
1408
					    struct machine *machine)
1409
{
1410
	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1411

1412 1413
	if (sched->tp_handler->migrate_task_event)
		return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1414

1415
	return 0;
1416 1417
}

1418
typedef int (*tracepoint_handler)(struct perf_tool *tool,
1419
				  struct perf_evsel *evsel,
1420
				  struct perf_sample *sample,
1421
				  struct machine *machine);
1422

1423 1424
static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
						 union perf_event *event __maybe_unused,
1425 1426 1427
						 struct perf_sample *sample,
						 struct perf_evsel *evsel,
						 struct machine *machine)
{
1429
	struct thread *thread = machine__findnew_thread(machine, sample->tid);
1430
	int err = 0;

	if (thread == NULL) {
1433
		pr_debug("problem processing %s event, skipping it.\n",
1434
			 perf_evsel__name(evsel));
		return -1;
	}

1438 1439
	evsel->hists.stats.total_period += sample->period;
	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
1440

1441 1442
	if (evsel->handler.func != NULL) {
		tracepoint_handler f = evsel->handler.func;
1443
		err = f(tool, evsel, sample, machine);
1444
	}

1446
	return err;
}
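
/*
 * Map each sched tracepoint to its handler; the perf_tool sample
 * callback above looks the handler up via evsel->handler.func and
 * dispatches to the mode-specific trace_sched_handler.
 */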

1449 1450
static int perf_sched__read_events(struct perf_sched *sched, bool destroy,
				   struct perf_session **psession)
{
1452 1453 1454 1455 1456 1457 1458 1459 1460
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch",	      process_sched_switch_event, },
		{ "sched:sched_stat_runtime", process_sched_runtime_event, },
		{ "sched:sched_wakeup",	      process_sched_wakeup_event, },
		{ "sched:sched_wakeup_new",   process_sched_wakeup_event, },
		{ "sched:sched_process_fork", process_sched_fork_event, },
		{ "sched:sched_process_exit", process_sched_exit_event, },
		{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
	};
1461 1462
	struct perf_session *session;

1463
	session = perf_session__new(sched->input_name, O_RDONLY, 0, false, &sched->tool);
1464 1465 1466 1467
	if (session == NULL) {
		pr_debug("No Memory for session\n");
		return -1;
	}
1468

1469 1470
	if (perf_session__set_tracepoints_handlers(session, handlers))
		goto out_delete;
1471

1472
	if (perf_session__has_traces(session, "record -R")) {
1473
		int err = perf_session__process_events(session, &sched->tool);
1474 1475 1476 1477
		if (err) {
			pr_err("Failed to process events, error %d", err);
			goto out_delete;
		}
1478

1479 1480 1481
		sched->nr_events      = session->hists.stats.nr_events[0];
		sched->nr_lost_events = session->hists.stats.total_lost;
		sched->nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
1482
	}
1483

1484 1485 1486 1487 1488
	if (destroy)
		perf_session__delete(session);

	if (psession)
		*psession = session;
1489 1490 1491 1492 1493 1494

	return 0;

out_delete:
	perf_session__delete(session);
	return -1;
}

1497
static void print_bad_events(struct perf_sched *sched)
1498
{
1499
	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
1500
		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1501 1502
			(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
			sched->nr_unordered_timestamps, sched->nr_timestamps);
1503
	}
1504
	if (sched->nr_lost_events && sched->nr_events) {
1505
		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1506 1507
			(double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
			sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
1508
	}
1509
	if (sched->nr_state_machine_bugs && sched->nr_timestamps) {
1510
		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
1511 1512 1513
			(double)sched->nr_state_machine_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_state_machine_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
1514 1515 1516
			printf(" (due to lost events?)");
		printf("\n");
	}
1517
	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
1518
		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
1519 1520 1521
			(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
			sched->nr_context_switch_bugs, sched->nr_timestamps);
		if (sched->nr_lost_events)
1522 1523 1524 1525 1526
			printf(" (due to lost events?)");
		printf("\n");
	}
}

1527
static int perf_sched__lat(struct perf_sched *sched)
1528 1529
{
	struct rb_node *next;
1530
	struct perf_session *session;
1531 1532

	setup_pager();
1533
	if (perf_sched__read_events(sched, false, &session))
1534
		return -1;
1535
	perf_sched__sort_lat(sched);
1536

1537 1538 1539
	printf("\n ---------------------------------------------------------------------------------------------------------------\n");
	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at     |\n");
	printf(" ---------------------------------------------------------------------------------------------------------------\n");
1540

1541
	next = rb_first(&sched->sorted_atom_root);
1542 1543 1544 1545 1546

	while (next) {
		struct work_atoms *work_list;

		work_list = rb_entry(next, struct work_atoms, node);
1547
		output_lat_thread(sched, work_list);
1548 1549 1550 1551
		next = rb_next(next);
	}

	printf(" -----------------------------------------------------------------------------------------\n");
1552
	printf("  TOTAL:                |%11.3f ms |%9" PRIu64 " |\n",
1553
		(double)sched->all_runtime / 1e6, sched->all_count);
1554 1555 1556

	printf(" ---------------------------------------------------\n");

1557
	print_bad_events(sched);
1558 1559
	printf("\n");

1560
	perf_session__delete(session);
1561
	return 0;
1562 1563
}

1564
static int perf_sched__map(struct perf_sched *sched)
1565
{
1566
	sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1567

1568
	setup_pager();
1569
	if (perf_sched__read_events(sched, true, NULL))
1570
		return -1;
1571
	print_bad_events(sched);
1572
	return 0;
1573 1574
}

1575
static int perf_sched__replay(struct perf_sched *sched)
1576 1577 1578
{
	unsigned long i;

1579 1580
	calibrate_run_measurement_overhead(sched);
	calibrate_sleep_measurement_overhead(sched);
1581

1582
	test_calibrations(sched);
1583

1584
	if (perf_sched__read_events(sched, true, NULL))
1585
		return -1;
1586

1587 1588 1589
	printf("nr_run_events:        %ld\n", sched->nr_run_events);
	printf("nr_sleep_events:      %ld\n", sched->nr_sleep_events);
	printf("nr_wakeup_events:     %ld\n", sched->nr_wakeup_events);
1590

1591 1592 1593 1594 1595
	if (sched->targetless_wakeups)
		printf("target-less wakeups:  %ld\n", sched->targetless_wakeups);
	if (sched->multitarget_wakeups)
		printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
	if (sched->nr_run_events_optimized)
1596
		printf("run atoms optimized: %ld\n",
1597
			sched->nr_run_events_optimized);
1598

1599 1600
	print_task_traces(sched);
	add_cross_task_wakeups(sched);
1601

1602
	create_tasks(sched);
1603
	printf("------------------------------------------------------------\n");
1604 1605
	for (i = 0; i < sched->replay_repeat; i++)
		run_one_test(sched);
1606 1607

	return 0;
1608 1609
}

1610 1611
static void setup_sorting(struct perf_sched *sched, const struct option *options,
			  const char * const usage_msg[])
1612
{
1613
	char *tmp, *tok, *str = strdup(sched->sort_order);
1614 1615 1616

	for (tok = strtok_r(str, ", ", &tmp);
			tok; tok = strtok_r(NULL, ", ", &tmp)) {
1617
		if (sort_dimension__add(tok, &sched->sort_list) < 0) {
1618
			error("Unknown --sort key: `%s'", tok);
1619
			usage_with_options(usage_msg, options);
1620 1621 1622 1623 1624
		}
	}

	free(str);

1625
	sort_dimension__add("pid", &sched->cmp_pid);
1626 1627
}
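
/*
 * 'perf sched record' is just 'perf record' with the sched
 * tracepoints below pre-selected; any user arguments are appended to
 * the synthesized argv.
 */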

1628 1629 1630 1631
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-f",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_exit",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup",
		"-e", "sched:sched_migrate_task",
	};
1649 1650 1651 1652

	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

1653
	if (rec_argv == NULL)
1654 1655
		return -ENOMEM;

1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = argv[j];

	BUG_ON(i != rec_argc);

	return cmd_record(i, rec_argv, NULL);
}
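
/*
 * Typical usage (illustrative):
 *
 *   perf sched record -- sleep 1
 *   perf sched latency --sort max
 *   perf sched map
 *   perf sched replay --repeat 10
 */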

1667
int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
{
1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745
	const char default_sort_order[] = "avg, max, switch, runtime";
	struct perf_sched sched = {
		.tool = {
			.sample		 = perf_sched__process_tracepoint_sample,
			.comm		 = perf_event__process_comm,
			.lost		 = perf_event__process_lost,
			.fork		 = perf_event__process_task,
			.ordered_samples = true,
		},
		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
		.start_work_mutex     = PTHREAD_MUTEX_INITIALIZER,
		.work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
		.curr_pid	      = { [0 ... MAX_CPUS - 1] = -1 },
		.sort_order	      = default_sort_order,
		.replay_repeat	      = 10,
		.profile_cpu	      = -1,
		.next_shortname1      = 'A',
		.next_shortname2      = '0',
	};
	const struct option latency_options[] = {
	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
		   "sort by key(s): runtime, switch, avg, max"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_INTEGER('C', "CPU", &sched.profile_cpu,
		    "CPU to profile on"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option replay_options[] = {
	OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
		     "repeat the workload replay N times (-1: infinite)"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const struct option sched_options[] = {
	OPT_STRING('i', "input", &sched.input_name, "file",
		    "input file name"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show symbol address, etc)"),
	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
		    "dump raw trace in ASCII"),
	OPT_END()
	};
	const char * const latency_usage[] = {
		"perf sched latency [<options>]",
		NULL
	};
	const char * const replay_usage[] = {
		"perf sched replay [<options>]",
		NULL
	};
	const char * const sched_usage[] = {
		"perf sched [<options>] {record|latency|map|replay|script}",
		NULL
	};
	struct trace_sched_handler lat_ops  = {
		.wakeup_event	    = latency_wakeup_event,
		.switch_event	    = latency_switch_event,
		.runtime_event	    = latency_runtime_event,
		.fork_event	    = latency_fork_event,
		.migrate_task_event = latency_migrate_task_event,
	};
	struct trace_sched_handler map_ops  = {
		.switch_event	    = map_switch_event,
	};
	struct trace_sched_handler replay_ops  = {
		.wakeup_event	    = replay_wakeup_event,
		.switch_event	    = replay_switch_event,
		.fork_event	    = replay_fork_event,
	};

1746 1747 1748 1749
	argc = parse_options(argc, argv, sched_options, sched_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc)
		usage_with_options(sched_usage, sched_options);

1751
	/*
1752
	 * Aliased to 'perf script' for now:
1753
	 */
1754 1755
	if (!strcmp(argv[0], "script"))
		return cmd_script(argc, argv, prefix);
1756

1757
	symbol__init();
1758 1759 1760
	if (!strncmp(argv[0], "rec", 3)) {
		return __cmd_record(argc, argv);
	} else if (!strncmp(argv[0], "lat", 3)) {
1761
		sched.tp_handler = &lat_ops;
1762 1763 1764 1765 1766
		if (argc > 1) {
			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
			if (argc)
				usage_with_options(latency_usage, latency_options);
		}
1767 1768
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__lat(&sched);
1769
	} else if (!strcmp(argv[0], "map")) {
1770 1771 1772
		sched.tp_handler = &map_ops;
		setup_sorting(&sched, latency_options, latency_usage);
		return perf_sched__map(&sched);
1773
	} else if (!strncmp(argv[0], "rep", 3)) {
1774
		sched.tp_handler = &replay_ops;
1775 1776 1777 1778 1779
		if (argc) {
			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
			if (argc)
				usage_with_options(replay_usage, replay_options);
		}
1780
		return perf_sched__replay(&sched);
1781 1782 1783 1784
	} else {
		usage_with_options(sched_usage, sched_options);
	}

1785
	return 0;
}