• Stanislaw Gruszka's avatar
    sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency · 6e998916
    Stanislaw Gruszka authored
    Commit d670ec13 "posix-cpu-timers: Cure SMP wobbles" fixes one glibc
    test case at the cost of breaking another one. After that commit, calling
    clock_nanosleep(TIMER_ABSTIME, X) and then clock_gettime(&Y) can result
    in Y being smaller than X.
    
    A reproducer/tester can be found further below; it can be compiled and run with:
    
    	gcc -o tst-cpuclock2 tst-cpuclock2.c -pthread
    	while ./tst-cpuclock2 ; do : ; done
    
    This reproducer, when running on a buggy kernel, will complain
    about "clock_gettime difference too small".
    
    The issue happens because, on start in thread_group_cputimer(), we initialize
    the cputimer's sum_exec_runtime with thread runtime that has not yet been
    accounted, and then add that thread runtime to the running cputimer again on
    the scheduler tick, making its sum_exec_runtime bigger than the actual
    thread runtime.
    
    KOSAKI Motohiro posted a fix for this problem, but that patch was never
    applied: https://lkml.org/lkml/2013/5/26/191 .
    
    This patch takes a different approach to cure the problem. It calls
    update_curr() when the cputimer starts, which ensures that we have updated
    stats for the running threads and that on the next scheduler tick we account
    only the runtime that elapsed since the cputimer started. That also ensures
    we have a consistent state between the CPU times of the individual threads
    and the CPU time of the process consisting of those threads.
    
    Full reproducer (tst-cpuclock2.c):
    
    	#define _GNU_SOURCE
    	#include <unistd.h>
    	#include <sys/syscall.h>
    	#include <stdio.h>
    	#include <time.h>
    	#include <pthread.h>
    	#include <stdint.h>
    	#include <inttypes.h>
    
    	/* Parameters for the Linux kernel ABI for CPU clocks.  */
    	/* Clock-type value; used below as the `clock` argument of
    	   MAKE_PROCESS_CPUCLOCK.  */
    	#define CPUCLOCK_SCHED          2
    	/* Build a clockid_t from `pid` and `clock`: the bitwise complement of
    	   `pid` is shifted left by 3 and OR-ed with `clock`.  */
    	#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
    		((~(clockid_t) (pid) << 3) | (clockid_t) (clock))
    
    	/* Barrier waited on by both main() and chew_cpu(); main() initialises it
    	   with a count of 2.  */
    	static pthread_barrier_t barrier;
    
    	/* Thread body that helps advance the clock: wait on the shared barrier,
    	   then spin forever.  `arg` is unused.  */
    	static void *chew_cpu(void *arg)
    	{
    		pthread_barrier_wait(&barrier);
    		for (;;)
    			continue;

    		return NULL;
    	}
    
    	/* Invoke the clock_nanosleep system call directly (don't use the glibc
    	   wrapper) on the clock id built by
    	   MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED).  `flags` and `req` are passed
    	   through unchanged and the remaining-time pointer is NULL.  Returns the
    	   result of syscall().  */
    	static int do_nanosleep(int flags, const struct timespec *req)
    	{
    		clockid_t process_cpu_clock = MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED);
    		long rc;

    		rc = syscall(SYS_clock_nanosleep, process_cpu_clock, flags, req, NULL);
    		return rc;
    	}
    
    	/* Return the signed distance from *before to *after in nanoseconds; the
    	   result is negative when *after is earlier than *before.

    	   The seconds and nanoseconds deltas are computed separately in signed
    	   64-bit arithmetic.  This avoids mixing an unsigned constant into the
    	   signed computation and avoids scaling a whole absolute timestamp by
    	   10^9; only the difference in seconds is scaled.  */
    	static int64_t tsdiff(const struct timespec *before, const struct timespec *after)
    	{
    		const int64_t nsec_per_sec = INT64_C(1000000000);
    		int64_t sec_delta = (int64_t) after->tv_sec - (int64_t) before->tv_sec;
    		int64_t nsec_delta = (int64_t) after->tv_nsec - (int64_t) before->tv_nsec;

    		return sec_delta * nsec_per_sec + nsec_delta;
    	}
    
    	/* Reproducer entry point.  Starts one CPU-burning thread, performs a
    	   relative and then an absolute clock_nanosleep via do_nanosleep(), and
    	   checks the CLOCK_PROCESS_CPUTIME_ID readings taken before and after the
    	   absolute sleep.  Returns 0 when both checks pass and 1 when a check
    	   fails or a setup/system call reports an error.  */
    	int main(void)
    	{
    		int result = 0;
    		pthread_t th;

    		pthread_barrier_init(&barrier, NULL, 2);

    		if (pthread_create(&th, NULL, chew_cpu, NULL) != 0) {
    			perror("pthread_create");
    			return 1;
    		}

    		pthread_barrier_wait(&barrier);

    		/* The test.  */
    		struct timespec before, after, sleeptimeabs;
    		int64_t sleepdiff, diffabs;
    		const struct timespec sleeptime = {.tv_sec = 0,.tv_nsec = 100000000 };

    		/* The relative nanosleep.  Not sure why this is needed, but its presence
    		   seems to make it easier to reproduce the problem.  */
    		if (do_nanosleep(0, &sleeptime) != 0) {
    			perror("clock_nanosleep");
    			return 1;
    		}

    		/* Get the current time.  */
    		if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &before) < 0) {
    			perror("clock_gettime[2]");
    			return 1;
    		}

    		/* Compute the absolute sleep time based on the current time.  */
    		uint64_t nsec = before.tv_nsec + sleeptime.tv_nsec;
    		sleeptimeabs.tv_sec = before.tv_sec + nsec / 1000000000;
    		sleeptimeabs.tv_nsec = nsec % 1000000000;

    		/* Sleep for the computed time.  */
    		if (do_nanosleep(TIMER_ABSTIME, &sleeptimeabs) != 0) {
    			perror("absolute clock_nanosleep");
    			return 1;
    		}

    		/* Get the time after the sleep.  */
    		if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &after) < 0) {
    			perror("clock_gettime[3]");
    			return 1;
    		}

    		/* The time after sleep should always be equal to or after the absolute sleep
    		   time passed to clock_nanosleep.  */
    		sleepdiff = tsdiff(&sleeptimeabs, &after);
    		if (sleepdiff < 0) {
    			printf("absolute clock_nanosleep woke too early: %" PRId64 "\n", sleepdiff);
    			result = 1;

    			/* tv_sec and tv_nsec have platform-dependent integer types, so
    			   cast them explicitly to match the conversion specifiers.  */
    			printf("Before %lld.%09lld\n",
    			       (long long) before.tv_sec, (long long) before.tv_nsec);
    			printf("After  %lld.%09lld\n",
    			       (long long) after.tv_sec, (long long) after.tv_nsec);
    			printf("Sleep  %lld.%09lld\n",
    			       (long long) sleeptimeabs.tv_sec, (long long) sleeptimeabs.tv_nsec);
    		}

    		/* The difference between the timestamps taken before and after the
    		   clock_nanosleep call should be equal to or more than the duration of the
    		   sleep.  */
    		diffabs = tsdiff(&before, &after);
    		if (diffabs < sleeptime.tv_nsec) {
    			printf("clock_gettime difference too small: %" PRId64 "\n", diffabs);
    			result = 1;
    		}

    		pthread_cancel(th);

    		return result;
    	}
    Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Cc: Rik van Riel <riel@redhat.com>
    Cc: Frederic Weisbecker <fweisbec@gmail.com>
    Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
    Cc: Oleg Nesterov <oleg@redhat.com>
    Cc: Linus Torvalds <torvalds@linux-foundation.org>
    Link: http://lkml.kernel.org/r/20141112155843.GA24803@redhat.com
    Signed-off-by: Ingo Molnar <mingo@kernel.org>
    6e998916
core.c 195 KB