Commit fff7fb0b authored by Zhaoxiu Zeng's avatar Zhaoxiu Zeng Committed by Linus Torvalds

lib/GCD.c: use binary GCD algorithm instead of Euclidean

The binary GCD algorithm is based on the following facts:
	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a fast
__ffs (find least significant set bit) instruction is available.  This
allows the unpredictable branches in the bit-at-a-time shifting loop to
be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 10174
  gcd1: elapsed 2120
  gcd2: elapsed 2902
  gcd3: elapsed 2039
  gcd4: elapsed 2812
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9309
  gcd1: elapsed 2280
  gcd2: elapsed 2822
  gcd3: elapsed 2217
  gcd4: elapsed 2710
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9589
  gcd1: elapsed 2098
  gcd2: elapsed 2815
  gcd3: elapsed 2030
  gcd4: elapsed 2718
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9914
  gcd1: elapsed 2309
  gcd2: elapsed 2779
  gcd3: elapsed 2228
  gcd4: elapsed 2709
  PASS

[akpm@linux-foundation.org: avoid #defining a CONFIG_ variable]
Signed-off-by: default avatarZhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: default avatarGeorge Spelvin <linux@horizon.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 3bcadd6f
......@@ -647,4 +647,7 @@ config COMPAT_OLD_SIGACTION
config ARCH_NO_COHERENT_DMA_MMAP
bool
config CPU_NO_EFFICIENT_FFS
def_bool n
source "kernel/gcov/Kconfig"
......@@ -26,6 +26,7 @@ config ALPHA
select MODULES_USE_ELF_RELA
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
help
The Alpha is a 64-bit general-purpose processor designed and
marketed by the Digital Equipment Corporation of blessed memory,
......
......@@ -107,6 +107,7 @@ choice
config ISA_ARCOMPACT
bool "ARCompact ISA"
select CPU_NO_EFFICIENT_FFS
help
The original ARC ISA of ARC600/700 cores
......
......@@ -421,18 +421,21 @@ config CPU_32v3
select CPU_USE_DOMAINS if MMU
select NEED_KUSER_HELPERS
select TLS_REG_EMUL if SMP || !MMU
select CPU_NO_EFFICIENT_FFS
config CPU_32v4
bool
select CPU_USE_DOMAINS if MMU
select NEED_KUSER_HELPERS
select TLS_REG_EMUL if SMP || !MMU
select CPU_NO_EFFICIENT_FFS
config CPU_32v4T
bool
select CPU_USE_DOMAINS if MMU
select NEED_KUSER_HELPERS
select TLS_REG_EMUL if SMP || !MMU
select CPU_NO_EFFICIENT_FFS
config CPU_32v5
bool
......
......@@ -20,6 +20,7 @@ config H8300
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
select HAVE_ARCH_KGDB
select CPU_NO_EFFICIENT_FFS
config RWSEM_GENERIC_SPINLOCK
def_bool y
......
......@@ -17,6 +17,7 @@ config M32R
select ARCH_USES_GETTIMEOFFSET
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
select CPU_NO_EFFICIENT_FFS
config SBUS
bool
......
......@@ -40,6 +40,7 @@ config M68000
select CPU_HAS_NO_MULDIV64
select CPU_HAS_NO_UNALIGNED
select GENERIC_CSUM
select CPU_NO_EFFICIENT_FFS
help
The Freescale (was Motorola) 68000 CPU is the first generation of
the well known M68K family of processors. The CPU core as well as
......@@ -51,6 +52,7 @@ config MCPU32
bool
select CPU_HAS_NO_BITFIELDS
select CPU_HAS_NO_UNALIGNED
select CPU_NO_EFFICIENT_FFS
help
The Freescale (was then Motorola) CPU32 is a CPU core that is
based on the 68020 processor. For the most part it is used in
......@@ -130,6 +132,7 @@ config M5206
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5206 processor support.
......@@ -138,6 +141,7 @@ config M5206e
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5206e processor support.
......@@ -163,6 +167,7 @@ config M5249
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5249 processor support.
......@@ -171,6 +176,7 @@ config M525x
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help
Freescale (Motorola) Coldfire 5251/5253 processor support.
......@@ -189,6 +195,7 @@ config M5272
depends on !MMU
select COLDFIRE_SW_A7
select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5272 processor support.
......@@ -217,6 +224,7 @@ config M5307
select COLDFIRE_SW_A7
select HAVE_CACHE_CB
select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5307 processor support.
......@@ -242,6 +250,7 @@ config M5407
select COLDFIRE_SW_A7
select HAVE_CACHE_CB
select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help
Motorola ColdFire 5407 processor support.
......@@ -251,6 +260,7 @@ config M547x
select MMU_COLDFIRE if MMU
select HAVE_CACHE_CB
select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help
Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
......@@ -260,6 +270,7 @@ config M548x
select M54xx
select HAVE_CACHE_CB
select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help
Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
......
......@@ -30,6 +30,7 @@ config METAG
select OF
select OF_EARLY_FLATTREE
select SPARSE_IRQ
select CPU_NO_EFFICIENT_FFS
config STACKTRACE_SUPPORT
def_bool y
......
......@@ -32,6 +32,7 @@ config MICROBLAZE
select OF_EARLY_FLATTREE
select TRACING_SUPPORT
select VIRT_TO_BUS
select CPU_NO_EFFICIENT_FFS
config SWAP
def_bool n
......
......@@ -204,6 +204,16 @@
#endif
#endif
/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
(defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
(defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
(defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
(defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
(defined(cpu_has_mips64r6) && cpu_has_mips64r6))
#define CPU_NO_EFFICIENT_FFS 1
#endif
#ifndef cpu_has_mips_1
# define cpu_has_mips_1 (!cpu_has_mips_r6)
#endif
......
......@@ -15,6 +15,7 @@ config NIOS2
select SOC_BUS
select SPARSE_IRQ
select USB_ARCH_HAS_HCD if USB_SUPPORT
select CPU_NO_EFFICIENT_FFS
config GENERIC_CSUM
def_bool y
......
......@@ -25,6 +25,7 @@ config OPENRISC
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
select OR1K_PIC
select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
config MMU
def_bool y
......
......@@ -32,6 +32,7 @@ config PARISC
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_SECCOMP_FILTER
select ARCH_NO_COHERENT_DMA_MMAP
select CPU_NO_EFFICIENT_FFS
help
The PA-RISC microprocessor is designed by Hewlett-Packard and used
......
......@@ -123,6 +123,7 @@ config S390
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_EARLY_PFN_TO_NID
select HAVE_ARCH_JUMP_LABEL
select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY
select HAVE_ARCH_TRACEHOOK
......
......@@ -14,6 +14,7 @@ config SCORE
select VIRT_TO_BUS
select MODULES_USE_ELF_REL
select CLONE_BACKWARDS
select CPU_NO_EFFICIENT_FFS
choice
prompt "System type"
......
......@@ -20,6 +20,7 @@ config SUPERH
select PERF_USE_VMALLOC
select HAVE_DEBUG_KMEMLEAK
select HAVE_KERNEL_GZIP
select CPU_NO_EFFICIENT_FFS
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_XZ
......
......@@ -42,6 +42,7 @@ config SPARC
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
select ARCH_HAS_SG_CHAIN
select CPU_NO_EFFICIENT_FFS
config SPARC32
def_bool !64BIT
......
......@@ -2,20 +2,77 @@
#include <linux/gcd.h>
#include <linux/export.h>
/* Greatest common divisor */
/*
* This implements the binary GCD algorithm. (Often attributed to Stein,
* but as Knuth has noted, appears in a first-century Chinese math text.)
*
* This is faster than the division-based algorithm even on x86, which
* has decent hardware division.
*/
#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS)
/* If __ffs is available, the even/odd algorithm benchmarks slower. */
unsigned long gcd(unsigned long a, unsigned long b)
{
unsigned long r;
unsigned long r = a | b;
if (!a || !b)
return r;
b >>= __ffs(b);
if (b == 1)
return r & -r;
for (;;) {
a >>= __ffs(a);
if (a == 1)
return r & -r;
if (a == b)
return a << __ffs(r);
if (a < b)
swap(a, b);
a -= b;
}
}
#else
if (!b)
/* If normalization is done by loops, the even/odd algorithm is a win. */
unsigned long gcd(unsigned long a, unsigned long b)
{
unsigned long r = a | b;
if (!a || !b)
return r;
/* Isolate lsbit of r */
r &= -r;
while (!(b & r))
b >>= 1;
if (b == r)
return r;
for (;;) {
while (!(a & r))
a >>= 1;
if (a == r)
return r;
if (a == b)
return a;
while ((r = a % b) != 0) {
a = b;
b = r;
if (a < b)
swap(a, b);
a -= b;
a >>= 1;
if (a & r)
a += b;
a >>= 1;
}
return b;
}
#endif
EXPORT_SYMBOL_GPL(gcd);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment