Commit fff7fb0b, authored by Zhaoxiu Zeng, committed by Linus Torvalds

lib/GCD.c: use binary GCD algorithm instead of Euclidean

The binary GCD algorithm is based on the following facts:
	1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are both odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% of the execution time) than the
division-based Euclidean algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a fast
__ffs (find least significant set bit) instruction is available.  When
it is, using __ffs eliminates the unpredictable branches of the
bit-at-a-time shifting loop.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	/*
	 * Exchange the values of two lvalues of the same type.
	 *
	 * A temporary is used instead of the XOR trick so that swap(x, x) leaves
	 * x unchanged (the XOR form would zero it) and so that the macro is not
	 * limited to integer operands.  Each argument is evaluated twice, so it
	 * must not have side effects.
	 */
	#define swap(a, b) \
		do { \
			__typeof__(a) swap_tmp_ = (a); \
			(a) = (b); \
			(b) = swap_tmp_; \
		} while (0)

	/*
	 * gcd0 - division-based Euclidean algorithm.
	 *
	 * Returns the greatest common divisor of a and b; when one operand is
	 * zero the other operand is returned.
	 */
	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long rem;

		/* Order the operands so that a >= b before dividing. */
		if (b > a)
			swap(a, b);

		/* gcd(a, 0) == a; this also protects the modulo below. */
		if (!b)
			return a;

		/* Replace (a, b) by (b, a mod b) until the remainder vanishes. */
		for (rem = a % b; rem != 0; rem = a % b) {
			a = b;
			b = rem;
		}

		return b;
	}

	/*
	 * gcd1 - binary GCD that strips trailing zero bits with
	 * __builtin_ctzl().
	 *
	 * Returns the greatest common divisor of a and b; when one operand is
	 * zero the other operand is returned.
	 */
	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		const unsigned long both = a | b;

		/* With a zero operand, a | b is simply the other operand. */
		if (a == 0 || b == 0)
			return both;

		/* Make b odd; it stays odd for the rest of the computation. */
		b >>= __builtin_ctzl(b);

		/*
		 * Make a odd, then subtract the smaller odd value from the
		 * larger one.  The difference is even and non-zero, so the
		 * next pass can strip its trailing zeros again.
		 */
		for (a >>= __builtin_ctzl(a); a != b; a >>= __builtin_ctzl(a)) {
			if (a < b)
				swap(a, b);
			a -= b;
		}

		/* Restore the power of two shared by the original operands. */
		return a << __builtin_ctzl(both);
	}

	/*
	 * gcd2 - "even/odd" binary GCD that normalises its operands with
	 * one-bit shifts instead of a count-trailing-zeros primitive.
	 *
	 * Returns the greatest common divisor of a and b; when one operand is
	 * zero the other operand is returned.
	 */
	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		const unsigned long both = a | b;
		unsigned long low;

		if (a == 0 || b == 0)
			return both;

		/* Lowest set bit of a | b: the power of two common to a and b. */
		low = both & -both;

		/* Shift b down until it is an odd multiple of low. */
		while ((b & low) == 0)
			b >>= 1;

		for (;;) {
			/* Same normalisation for a. */
			while ((a & low) == 0)
				a >>= 1;
			if (a == b)
				break;

			if (a < b)
				swap(a, b);

			/*
			 * a and b are odd multiples of low, so a - b is an even
			 * multiple and can be halved.  If the half is still an odd
			 * multiple, adding b makes it even again without changing
			 * the gcd, so the second halving is exact as well.
			 */
			a = (a - b) >> 1;
			if (a & low)
				a += b;
			a >>= 1;
		}

		return a;
	}

	/*
	 * gcd3 - gcd1 with an early exit as soon as either operand's odd part
	 * has been reduced to 1.
	 *
	 * Returns the greatest common divisor of a and b; when one operand is
	 * zero the other operand is returned.
	 */
	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		const unsigned long both = a | b;

		if (a == 0 || b == 0)
			return both;

		/* Make b odd. */
		b >>= __builtin_ctzl(b);

		if (b != 1) {
			/* Make a odd before every comparison. */
			for (a >>= __builtin_ctzl(a); a != 1; a >>= __builtin_ctzl(a)) {
				if (a == b)
					return a << __builtin_ctzl(both);

				/* Larger minus smaller: even and non-zero. */
				if (a < b)
					swap(a, b);
				a -= b;
			}
		}

		/*
		 * An odd part of 1 means the gcd is just the shared power of
		 * two, i.e. the lowest set bit of a | b.
		 */
		return both & -both;
	}

	/*
	 * gcd4 - gcd2 with an early exit as soon as either operand has been
	 * reduced to the shared power of two.
	 *
	 * Returns the greatest common divisor of a and b; when one operand is
	 * zero the other operand is returned.
	 */
	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		const unsigned long both = a | b;
		unsigned long low;

		if (a == 0 || b == 0)
			return both;

		/* Lowest set bit of a | b: the power of two common to a and b. */
		low = both & -both;

		/* Shift b down until it is an odd multiple of low. */
		while ((b & low) == 0)
			b >>= 1;

		if (b != low) {
			for (;;) {
				/* Same normalisation for a. */
				while ((a & low) == 0)
					a >>= 1;
				if (a == low)
					break;
				if (a == b)
					return a;

				if (a < b)
					swap(a, b);

				/*
				 * a - b is an even multiple of low.  Halve it, make
				 * it even again by adding b if necessary, and halve
				 * once more.
				 */
				a = (a - b) >> 1;
				if (a & low)
					a += b;
				a >>= 1;
			}
		}

		/* One operand collapsed to low, so low itself is the gcd. */
		return low;
	}

	/*
	 * Implementations under test.  Entry 0 (gcd0) supplies the reference
	 * result that main() compares every other entry against.
	 */
	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	/* Number of entries in gcd_func[]. */
	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	/*
	 * Store the x86 time-stamp counter in val.  The asm places the low half
	 * of the counter in the "a" register and the high half in the "d"
	 * register; the two halves are then combined into one 64-bit value.
	 * NOTE(review): a bare RDTSC is presumably not ordered against the
	 * surrounding instructions, so very short measurements may be noisy --
	 * confirm against the CPU manual.
	 */
	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	/*
	 * Time one call of gcd(a, b) using the time-stamp counter.
	 *
	 * The computed gcd is stored in *res (after the second counter read, so
	 * the store is not part of the measurement).  Returns the number of
	 * counter ticks that elapsed across the call.
	 */
	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long t0, t1;
		unsigned long value;

		rdtscll(t0);
		value = gcd(a, b);
		rdtscll(t1);

		*res = value;

		/* Second form covers the counter wrapping between the two reads. */
		return t1 >= t0 ? t1 - t0 : ~0ULL - t0 + 1 + t1;
	}

	#else

	/* Return the CLOCK_PROCESS_CPUTIME_ID clock reading as a timespec. */
	static inline struct timespec read_time(void)
	{
		struct timespec now;

		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &now);
		return now;
	}

	/*
	 * Return end - start in nanoseconds.
	 *
	 * Both arguments are taken by value; end is expected to be no earlier
	 * than start.
	 */
	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		time_t sec = end.tv_sec - start.tv_sec;
		long nsec = end.tv_nsec - start.tv_nsec;

		/* Borrow one second when the nanosecond difference is negative. */
		if (nsec < 0) {
			sec -= 1;
			nsec += 1000000000L;
		}

		return sec * 1000000000ULL + nsec;
	}

	/*
	 * Time one call of gcd(a, b) using read_time()/diff_time().
	 *
	 * The computed gcd is stored in *res (after the second clock read, so
	 * the store is not part of the measurement).  Returns the elapsed time
	 * in nanoseconds.
	 */
	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec before, after;
		unsigned long value;

		before = read_time();
		value = gcd(a, b);
		after = read_time();

		*res = value;
		return diff_time(before, after);
	}

	#endif

	/*
	 * Return a pseudo-random unsigned long built from rand().
	 *
	 * When long is 8 bytes wide, two rand() results are combined: the first
	 * call supplies the upper 32 bits and the second call is OR-ed into the
	 * lower bits.  The calls are made in separate statements because the
	 * order in which the operands of '|' are evaluated is unspecified, which
	 * would make the value produced for a given seed compiler-dependent.
	 */
	static inline unsigned long get_rand(void)
	{
		unsigned long first = (unsigned long)rand();

		if (sizeof(long) == 8) {
			unsigned long second = (unsigned long)rand();

			/* Two 16-bit shifts stay well defined even where long is 32 bits. */
			return (first << 16 << 16) | second;
		}

		return first;
	}

	/*
	 * Benchmark driver.
	 *
	 * Usage: gcd [-n loops] [-r repeats] [-s seed] [b]
	 *
	 *   -n loops    number of (a, b) input pairs to try (default 100)
	 *   -r repeats  timing repetitions per pair; the minimum is kept
	 *               (default 1000)
	 *   -s seed     srand() seed (default: current time)
	 *   b           if given, used as the second operand of every pair
	 *               instead of a random value
	 *
	 * For every implementation in gcd_func[] the per-pair minimum timings
	 * are summed and printed on stdout.  Afterwards every implementation's
	 * results are compared with those of gcd_func[0]; mismatches are listed
	 * on stderr, otherwise "PASS" is printed there.
	 *
	 * Returns 0 after a completed run, 1 on invalid arguments or allocation
	 * failure.
	 */
	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		int fixed_b_given;
		unsigned long fixed_b = 0;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int failed = 0;
		size_t i;
		int j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/*
				 * Unknown option or missing option argument: getopt()
				 * returns '?' and has already printed a diagnostic.
				 * Keep going with the remaining options.
				 */
				break;
			}
		}

		/*
		 * With repeats <= 0 the minimum timings would never be written
		 * before being summed, and a non-positive loop count gives a
		 * meaningless allocation size, so reject both up front.
		 */
		if (loops <= 0 || repeats <= 0) {
			fprintf(stderr, "%s: -n and -r must be positive integers\n", argv[0]);
			return 1;
		}

		/* Parse the optional fixed second operand once, not per iteration. */
		fixed_b_given = argc > optind;
		if (fixed_b_given)
			fixed_b = strtoul(argv[optind], NULL, 10);

		/* One row of TEST_ENTRIES results per input pair. */
		res = calloc((size_t)loops, sizeof(*res));
		if (!res) {
			perror("calloc");
			return 1;
		}
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = fixed_b_given ? fixed_b : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];

			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);

					/* The first repetition (k == 0) seeds the minimum. */
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%zu: elapsed %llu\n", i, elapsed[i]);

		/*
		 * Re-seed so that the same sequence of inputs is regenerated for
		 * the error report.
		 */
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = fixed_b_given ? fixed_b : get_rand();

			/* Find the first implementation that disagrees with gcd_func[0]. */
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i == TEST_ENTRIES)
				continue;

			if (!failed) {
				failed = 1;
				fprintf(stderr, "Error:\n");
			}
			fprintf(stderr, "gcd(%lu, %lu): ", a, b);
			for (i = 0; i < TEST_ENTRIES; i++)
				fprintf(stderr, "%lu%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
		}

		if (!failed)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 10174
  gcd1: elapsed 2120
  gcd2: elapsed 2902
  gcd3: elapsed 2039
  gcd4: elapsed 2812
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9309
  gcd1: elapsed 2280
  gcd2: elapsed 2822
  gcd3: elapsed 2217
  gcd4: elapsed 2710
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9589
  gcd1: elapsed 2098
  gcd2: elapsed 2815
  gcd3: elapsed 2030
  gcd4: elapsed 2718
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9914
  gcd1: elapsed 2309
  gcd2: elapsed 2779
  gcd3: elapsed 2228
  gcd4: elapsed 2709
  PASS

[akpm@linux-foundation.org: avoid #defining a CONFIG_ variable]
Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 3bcadd6f
...@@ -647,4 +647,7 @@ config COMPAT_OLD_SIGACTION ...@@ -647,4 +647,7 @@ config COMPAT_OLD_SIGACTION
config ARCH_NO_COHERENT_DMA_MMAP config ARCH_NO_COHERENT_DMA_MMAP
bool bool
config CPU_NO_EFFICIENT_FFS
def_bool n
source "kernel/gcov/Kconfig" source "kernel/gcov/Kconfig"
...@@ -26,6 +26,7 @@ config ALPHA ...@@ -26,6 +26,7 @@ config ALPHA
select MODULES_USE_ELF_RELA select MODULES_USE_ELF_RELA
select ODD_RT_SIGACTION select ODD_RT_SIGACTION
select OLD_SIGSUSPEND select OLD_SIGSUSPEND
select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
help help
The Alpha is a 64-bit general-purpose processor designed and The Alpha is a 64-bit general-purpose processor designed and
marketed by the Digital Equipment Corporation of blessed memory, marketed by the Digital Equipment Corporation of blessed memory,
......
...@@ -107,6 +107,7 @@ choice ...@@ -107,6 +107,7 @@ choice
config ISA_ARCOMPACT config ISA_ARCOMPACT
bool "ARCompact ISA" bool "ARCompact ISA"
select CPU_NO_EFFICIENT_FFS
help help
The original ARC ISA of ARC600/700 cores The original ARC ISA of ARC600/700 cores
......
...@@ -421,18 +421,21 @@ config CPU_32v3 ...@@ -421,18 +421,21 @@ config CPU_32v3
select CPU_USE_DOMAINS if MMU select CPU_USE_DOMAINS if MMU
select NEED_KUSER_HELPERS select NEED_KUSER_HELPERS
select TLS_REG_EMUL if SMP || !MMU select TLS_REG_EMUL if SMP || !MMU
select CPU_NO_EFFICIENT_FFS
config CPU_32v4 config CPU_32v4
bool bool
select CPU_USE_DOMAINS if MMU select CPU_USE_DOMAINS if MMU
select NEED_KUSER_HELPERS select NEED_KUSER_HELPERS
select TLS_REG_EMUL if SMP || !MMU select TLS_REG_EMUL if SMP || !MMU
select CPU_NO_EFFICIENT_FFS
config CPU_32v4T config CPU_32v4T
bool bool
select CPU_USE_DOMAINS if MMU select CPU_USE_DOMAINS if MMU
select NEED_KUSER_HELPERS select NEED_KUSER_HELPERS
select TLS_REG_EMUL if SMP || !MMU select TLS_REG_EMUL if SMP || !MMU
select CPU_NO_EFFICIENT_FFS
config CPU_32v5 config CPU_32v5
bool bool
......
...@@ -20,6 +20,7 @@ config H8300 ...@@ -20,6 +20,7 @@ config H8300
select HAVE_KERNEL_GZIP select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO select HAVE_KERNEL_LZO
select HAVE_ARCH_KGDB select HAVE_ARCH_KGDB
select CPU_NO_EFFICIENT_FFS
config RWSEM_GENERIC_SPINLOCK config RWSEM_GENERIC_SPINLOCK
def_bool y def_bool y
......
...@@ -17,6 +17,7 @@ config M32R ...@@ -17,6 +17,7 @@ config M32R
select ARCH_USES_GETTIMEOFFSET select ARCH_USES_GETTIMEOFFSET
select MODULES_USE_ELF_RELA select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW select HAVE_DEBUG_STACKOVERFLOW
select CPU_NO_EFFICIENT_FFS
config SBUS config SBUS
bool bool
......
...@@ -40,6 +40,7 @@ config M68000 ...@@ -40,6 +40,7 @@ config M68000
select CPU_HAS_NO_MULDIV64 select CPU_HAS_NO_MULDIV64
select CPU_HAS_NO_UNALIGNED select CPU_HAS_NO_UNALIGNED
select GENERIC_CSUM select GENERIC_CSUM
select CPU_NO_EFFICIENT_FFS
help help
The Freescale (was Motorola) 68000 CPU is the first generation of The Freescale (was Motorola) 68000 CPU is the first generation of
the well known M68K family of processors. The CPU core as well as the well known M68K family of processors. The CPU core as well as
...@@ -51,6 +52,7 @@ config MCPU32 ...@@ -51,6 +52,7 @@ config MCPU32
bool bool
select CPU_HAS_NO_BITFIELDS select CPU_HAS_NO_BITFIELDS
select CPU_HAS_NO_UNALIGNED select CPU_HAS_NO_UNALIGNED
select CPU_NO_EFFICIENT_FFS
help help
The Freescale (was then Motorola) CPU32 is a CPU core that is The Freescale (was then Motorola) CPU32 is a CPU core that is
based on the 68020 processor. For the most part it is used in based on the 68020 processor. For the most part it is used in
...@@ -130,6 +132,7 @@ config M5206 ...@@ -130,6 +132,7 @@ config M5206
depends on !MMU depends on !MMU
select COLDFIRE_SW_A7 select COLDFIRE_SW_A7
select HAVE_MBAR select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help help
Motorola ColdFire 5206 processor support. Motorola ColdFire 5206 processor support.
...@@ -138,6 +141,7 @@ config M5206e ...@@ -138,6 +141,7 @@ config M5206e
depends on !MMU depends on !MMU
select COLDFIRE_SW_A7 select COLDFIRE_SW_A7
select HAVE_MBAR select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help help
Motorola ColdFire 5206e processor support. Motorola ColdFire 5206e processor support.
...@@ -163,6 +167,7 @@ config M5249 ...@@ -163,6 +167,7 @@ config M5249
depends on !MMU depends on !MMU
select COLDFIRE_SW_A7 select COLDFIRE_SW_A7
select HAVE_MBAR select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help help
Motorola ColdFire 5249 processor support. Motorola ColdFire 5249 processor support.
...@@ -171,6 +176,7 @@ config M525x ...@@ -171,6 +176,7 @@ config M525x
depends on !MMU depends on !MMU
select COLDFIRE_SW_A7 select COLDFIRE_SW_A7
select HAVE_MBAR select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help help
Freescale (Motorola) Coldfire 5251/5253 processor support. Freescale (Motorola) Coldfire 5251/5253 processor support.
...@@ -189,6 +195,7 @@ config M5272 ...@@ -189,6 +195,7 @@ config M5272
depends on !MMU depends on !MMU
select COLDFIRE_SW_A7 select COLDFIRE_SW_A7
select HAVE_MBAR select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help help
Motorola ColdFire 5272 processor support. Motorola ColdFire 5272 processor support.
...@@ -217,6 +224,7 @@ config M5307 ...@@ -217,6 +224,7 @@ config M5307
select COLDFIRE_SW_A7 select COLDFIRE_SW_A7
select HAVE_CACHE_CB select HAVE_CACHE_CB
select HAVE_MBAR select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help help
Motorola ColdFire 5307 processor support. Motorola ColdFire 5307 processor support.
...@@ -242,6 +250,7 @@ config M5407 ...@@ -242,6 +250,7 @@ config M5407
select COLDFIRE_SW_A7 select COLDFIRE_SW_A7
select HAVE_CACHE_CB select HAVE_CACHE_CB
select HAVE_MBAR select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help help
Motorola ColdFire 5407 processor support. Motorola ColdFire 5407 processor support.
...@@ -251,6 +260,7 @@ config M547x ...@@ -251,6 +260,7 @@ config M547x
select MMU_COLDFIRE if MMU select MMU_COLDFIRE if MMU
select HAVE_CACHE_CB select HAVE_CACHE_CB
select HAVE_MBAR select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help help
Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support. Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
...@@ -260,6 +270,7 @@ config M548x ...@@ -260,6 +270,7 @@ config M548x
select M54xx select M54xx
select HAVE_CACHE_CB select HAVE_CACHE_CB
select HAVE_MBAR select HAVE_MBAR
select CPU_NO_EFFICIENT_FFS
help help
Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support. Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
......
...@@ -30,6 +30,7 @@ config METAG ...@@ -30,6 +30,7 @@ config METAG
select OF select OF
select OF_EARLY_FLATTREE select OF_EARLY_FLATTREE
select SPARSE_IRQ select SPARSE_IRQ
select CPU_NO_EFFICIENT_FFS
config STACKTRACE_SUPPORT config STACKTRACE_SUPPORT
def_bool y def_bool y
......
...@@ -32,6 +32,7 @@ config MICROBLAZE ...@@ -32,6 +32,7 @@ config MICROBLAZE
select OF_EARLY_FLATTREE select OF_EARLY_FLATTREE
select TRACING_SUPPORT select TRACING_SUPPORT
select VIRT_TO_BUS select VIRT_TO_BUS
select CPU_NO_EFFICIENT_FFS
config SWAP config SWAP
def_bool n def_bool n
......
...@@ -204,6 +204,16 @@ ...@@ -204,6 +204,16 @@
#endif #endif
#endif #endif
/*
 * Preprocessor-level stand-in for
 *	__builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r
 *
 * Unless at least one of the cpu_has_mips{32,64}r{1,2,6} macros is defined
 * and evaluates to non-zero in this #if, CPU_NO_EFFICIENT_FFS is defined.
 * NOTE(review): presumably because only those ISA releases provide an
 * instruction that makes __ffs cheap -- confirm.
 */
#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
(defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
(defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
(defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
(defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
(defined(cpu_has_mips64r6) && cpu_has_mips64r6))
#define CPU_NO_EFFICIENT_FFS 1
#endif
#ifndef cpu_has_mips_1 #ifndef cpu_has_mips_1
# define cpu_has_mips_1 (!cpu_has_mips_r6) # define cpu_has_mips_1 (!cpu_has_mips_r6)
#endif #endif
......
...@@ -15,6 +15,7 @@ config NIOS2 ...@@ -15,6 +15,7 @@ config NIOS2
select SOC_BUS select SOC_BUS
select SPARSE_IRQ select SPARSE_IRQ
select USB_ARCH_HAS_HCD if USB_SUPPORT select USB_ARCH_HAS_HCD if USB_SUPPORT
select CPU_NO_EFFICIENT_FFS
config GENERIC_CSUM config GENERIC_CSUM
def_bool y def_bool y
......
...@@ -25,6 +25,7 @@ config OPENRISC ...@@ -25,6 +25,7 @@ config OPENRISC
select MODULES_USE_ELF_RELA select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW select HAVE_DEBUG_STACKOVERFLOW
select OR1K_PIC select OR1K_PIC
select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
config MMU config MMU
def_bool y def_bool y
......
...@@ -32,6 +32,7 @@ config PARISC ...@@ -32,6 +32,7 @@ config PARISC
select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SECCOMP_FILTER
select ARCH_NO_COHERENT_DMA_MMAP select ARCH_NO_COHERENT_DMA_MMAP
select CPU_NO_EFFICIENT_FFS
help help
The PA-RISC microprocessor is designed by Hewlett-Packard and used The PA-RISC microprocessor is designed by Hewlett-Packard and used
......
...@@ -123,6 +123,7 @@ config S390 ...@@ -123,6 +123,7 @@ config S390
select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_EARLY_PFN_TO_NID select HAVE_ARCH_EARLY_PFN_TO_NID
select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL
select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY select HAVE_ARCH_SOFT_DIRTY
select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRACEHOOK
......
...@@ -14,6 +14,7 @@ config SCORE ...@@ -14,6 +14,7 @@ config SCORE
select VIRT_TO_BUS select VIRT_TO_BUS
select MODULES_USE_ELF_REL select MODULES_USE_ELF_REL
select CLONE_BACKWARDS select CLONE_BACKWARDS
select CPU_NO_EFFICIENT_FFS
choice choice
prompt "System type" prompt "System type"
......
...@@ -20,6 +20,7 @@ config SUPERH ...@@ -20,6 +20,7 @@ config SUPERH
select PERF_USE_VMALLOC select PERF_USE_VMALLOC
select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_KMEMLEAK
select HAVE_KERNEL_GZIP select HAVE_KERNEL_GZIP
select CPU_NO_EFFICIENT_FFS
select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZMA
select HAVE_KERNEL_XZ select HAVE_KERNEL_XZ
......
...@@ -42,6 +42,7 @@ config SPARC ...@@ -42,6 +42,7 @@ config SPARC
select ODD_RT_SIGACTION select ODD_RT_SIGACTION
select OLD_SIGSUSPEND select OLD_SIGSUSPEND
select ARCH_HAS_SG_CHAIN select ARCH_HAS_SG_CHAIN
select CPU_NO_EFFICIENT_FFS
config SPARC32 config SPARC32
def_bool !64BIT def_bool !64BIT
......
...@@ -2,20 +2,77 @@ ...@@ -2,20 +2,77 @@
#include <linux/gcd.h> #include <linux/gcd.h>
#include <linux/export.h> #include <linux/export.h>
/* Greatest common divisor */ /*
* This implements the binary GCD algorithm. (Often attributed to Stein,
* but as Knuth has noted, appears in a first-century Chinese math text.)
*
* This is faster than the division-based algorithm even on x86, which
* has decent hardware division.
*/
#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS)
/* If __ffs is available, the even/odd algorithm benchmarks slower. */
unsigned long gcd(unsigned long a, unsigned long b) unsigned long gcd(unsigned long a, unsigned long b)
{ {
unsigned long r; unsigned long r = a | b;
if (!a || !b)
return r;
if (a < b) b >>= __ffs(b);
swap(a, b); if (b == 1)
return r & -r;
if (!b) for (;;) {
return a; a >>= __ffs(a);
while ((r = a % b) != 0) { if (a == 1)
a = b; return r & -r;
b = r; if (a == b)
return a << __ffs(r);
if (a < b)
swap(a, b);
a -= b;
} }
return b;
} }
#else
/*
 * If normalization is done by loops, the even/odd algorithm is a win.
 *
 * Returns the greatest common divisor of a and b; when one operand is zero
 * the other operand is returned.
 */
unsigned long gcd(unsigned long a, unsigned long b)
{
	unsigned long lsb = a | b;

	/* With a zero operand, a | b is simply the other operand. */
	if (a == 0 || b == 0)
		return lsb;

	/* Keep only the lowest set bit: the power of two common to a and b. */
	lsb &= -lsb;

	/* Shift b down until it is an odd multiple of lsb. */
	while ((b & lsb) == 0)
		b >>= 1;
	if (b == lsb)
		return lsb;

	for (;;) {
		/* Same normalisation for a. */
		while ((a & lsb) == 0)
			a >>= 1;
		if (a == lsb)
			return lsb;
		if (a == b)
			return a;

		if (a < b)
			swap(a, b);

		/*
		 * a and b are odd multiples of lsb, so a - b is an even multiple
		 * and can be halved.  If the half is still an odd multiple,
		 * adding b makes it even again without changing the gcd, so the
		 * second halving is exact as well.
		 */
		a = (a - b) >> 1;
		if (a & lsb)
			a += b;
		a >>= 1;
	}
}
#endif
EXPORT_SYMBOL_GPL(gcd); EXPORT_SYMBOL_GPL(gcd);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment