Adding benchmarks

d0973c9f · Michael Droettboom · 8875c17b · d0973c9f · d0973c9f · d0973c9f
Commit d0973c9f authored Apr 05, 2018 by Michael Droettboom
42 changed files
--- a/Makefile
+++ b/Makefile
@@ -6,6 +6,7 @@ PYMINOR=$(basename $(PYVERSION))
 CPYTHONROOT=cpython
 CPYTHONLIB=$(CPYTHONROOT)/installs/python-$(PYVERSION)/lib/python$(PYMINOR)
 CPYTHONINC=$(CPYTHONROOT)/installs/python-$(PYVERSION)/include/python$(PYMINOR)
+HOSTPYTHON=$(CPYTHONROOT)/build/$(PYVERSION)/host/bin/python3
 CC=emcc
 CXX=em++
@@ -62,6 +63,11 @@ test: all build/test.html
 	py.test test
+benchmark: all build/test.html
+	python benchmark/benchmark.py $(HOSTPYTHON) $(BUILD)/benchmarks.json
+	python benchmark/plot_benchmark.py benchmarks.json $(BUILD)/benchmarks.png
 clean:
 	rm -fr root
 	rm build/*
@@ -88,6 +94,7 @@ root/.built: \
 	cp src/lazy_import.py $(SITEPACKAGES)
 	cp src/sitecustomize.py $(SITEPACKAGES)
 	cp src/webbrowser.py root/lib/python$(PYMINOR)
+	cp src/pystone.py root/lib/python$(PYMINOR)
 	( \
 		cd root/lib/python$(PYMINOR); \
 		rm -fr `cat ../../../remove_modules.txt`; \

--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
+import json
+import os
+import re
+import subprocess
+import sys
+sys.path.insert(0, os.path.abspath(
+     os.path.join(os.path.dirname(__file__), '..', 'test')))
+import conftest
+SKIP = set(['fft', 'hyantes'])
+def run_native(hostpython, code):
+    output = subprocess.check_output(
+        [os.path.abspath(hostpython), '-c', code],
+        cwd=os.path.dirname(__file__),
+        env={
+            'PYTHONPATH':
+            os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src'))}
+    )
+    return float(output.strip().split()[-1])
+def run_wasm(code):
+    s = conftest.SeleniumWrapper()
+    s.run(code)
+    runtime = float(s.logs[-1])
+    s.driver.quit()
+    return runtime
+def run_both(hostpython, code):
+    a = run_native(hostpython, code)
+    print(a)
+    b = run_wasm(code)
+    print(b)
+    result = (a, b)
+    return result
+def get_pystone_benchmarks():
+    yield 'pystone', (
+        "import pystone\n"
+        "pystone.main(pystone.LOOPS)\n"
+    )
+def parse_numpy_benchmark(filename):
+    lines = []
+    with open(filename) as fp:
+        for line in fp:
+            m = re.match('^#(setup|run): (.*)$', line)
+            if m:
+                line = '{} = {!r}\n'.format(m.group(1), m.group(2))
+            lines.append(line)
+    return ''.join(lines)
+def get_numpy_benchmarks():
+    root = '../numpy-benchmarks/benchmarks'
+    for filename in os.listdir(root):
+        name = os.path.splitext(filename)[0]
+        if name in SKIP:
+            continue
+        content = parse_numpy_benchmark(os.path.join(root, filename))
+        content += (
+            "setup = setup + '\\nfrom __main__ import {}'\n"
+            "from timeit import Timer\n"
+            "t = Timer(run, setup)\n"
+            "r = t.repeat(11, 40)\n"
+            "import numpy as np\n"
+            "print(np.mean(r))\n".format(name))
+        yield name, content
+def get_benchmarks():
+    yield from get_pystone_benchmarks()
+    yield from get_numpy_benchmarks()
+def main(hostpython):
+    results = {}
+    for k, v in get_benchmarks():
+        print(k)
+        results[k] = run_both(hostpython, v)
+    return results
+if __name__ == '__main__':
+    results = main(sys.argv[-2])
+    with open(sys.argv[-1], 'w') as fp:
+        json.dump(results, fp)
--- a/benchmark/benchmarks/README.md
+++ b/benchmark/benchmarks/README.md
+From https://github.com/serge-sans-paille/numpy-benchmarks
--- a/benchmark/benchmarks/allpairs_distances.py
+++ b/benchmark/benchmarks/allpairs_distances.py
+#setup: import numpy as np ; N = 50 ; X, Y = np.random.randn(100,N), np.random.randn(40,N)
+#run: allpairs_distances(X, Y)
+#pythran export allpairs_distances(float64[][], float64[][])
+import numpy as np
+def allpairs_distances(X,Y):
+  return np.array([[np.sum( (x-y) ** 2) for x in X] for y in Y])
--- a/benchmark/benchmarks/allpairs_distances_loops.py
+++ b/benchmark/benchmarks/allpairs_distances_loops.py
+#setup: import numpy as np ; N = 50 ; X, Y = np.random.randn(50,N), np.random.randn(40,N)
+#run: allpairs_distances_loops(X, Y)
+#pythran export allpairs_distances_loops(float64[][], float64[][])
+import numpy as np
+def allpairs_distances_loops(X,Y):
+  result = np.zeros( (X.shape[0], Y.shape[0]), X.dtype)
+  for i in range(X.shape[0]):
+    for j in range(Y.shape[0]):
+      result[i,j] = np.sum( (X[i,:] - Y[j,:]) ** 2)
+  return result
--- a/benchmark/benchmarks/arc_distance.py
+++ b/benchmark/benchmarks/arc_distance.py
+#setup: N = 10000 ; import numpy as np ; t0, p0, t1, p1 = np.random.randn(N), np.random.randn(N), np.random.randn(N), np.random.randn(N)
+#run: arc_distance(t0, p0, t1, p1)
+#pythran export arc_distance(float64 [], float64[], float64[], float64[])
+import numpy as np
+def arc_distance(theta_1, phi_1,
+                       theta_2, phi_2):
+    """
+    Calculates the pairwise arc distance between all points in vector a and b.
+    """
+    temp = np.sin((theta_2-theta_1)/2)**2+np.cos(theta_1)*np.cos(theta_2)*np.sin((phi_2-phi_1)/2)**2
+    distance_matrix = 2 * (np.arctan2(np.sqrt(temp),np.sqrt(1-temp)))
+    return distance_matrix
--- a/benchmark/benchmarks/check_mask.py
+++ b/benchmark/benchmarks/check_mask.py
+#setup: n=100 ; import numpy as np; db = np.array(np.random.randint(2, size=(n, 4)), dtype=bool)
+#run: check_mask(db)
+#from: http://stackoverflow.com/questions/34500913/numba-slower-for-numpy-bitwise-and-on-boolean-arrays
+#pythran export check_mask(bool[][])
+import numpy as np
+def check_mask(db, mask=[1, 0, 1]):
+    out = np.zeros(db.shape[0],dtype=bool)
+    for idx, line in enumerate(db):
+        target, vector = line[0], line[1:]
+        if (mask == np.bitwise_and(mask, vector)).all():
+            if target == 1:
+                out[idx] = 1
+    return out
--- a/benchmark/benchmarks/conv.py
+++ b/benchmark/benchmarks/conv.py
+#setup: N = 10 ; import numpy as np ; x = np.tri(N,N)*0.5 ; w = np.tri(5,5)*0.25
+#run: conv(x,w)
+#pythran export conv(float[][], float[][])
+import numpy as np
+def clamp(i, offset, maxval):
+    j = max(0, i + offset)
+    return min(j, maxval)
+def reflect(pos, offset, bound):
+    idx = pos+offset
+    return min(2*(bound-1)-idx,max(idx,-idx))
+def conv(x, weights):
+    sx = x.shape
+    sw = weights.shape
+    result = np.zeros_like(x)
+    for i in range(sx[0]):
+        for j in range(sx[1]):
+            for ii in range(sw[0]):
+                for jj in range(sw[1]):
+                    idx = clamp(i,ii-sw[0]//2,sw[0]), clamp(j,jj-sw[0]//2,sw[0])
+                    result[i,j] += x[idx] * weights[ii,jj]
+    return result
--- a/benchmark/benchmarks/create_grid.py
+++ b/benchmark/benchmarks/create_grid.py
+#from: http://stackoverflow.com/questions/13815719/creating-grid-with-numpy-performance
+#pythran export create_grid(float [])
+#setup: import numpy as np ; N = 800 ; x = np.arange(0,1,1./N)
+#run: create_grid(x)
+import numpy as np
+def create_grid(x):
+    N = x.shape[0]
+    z = np.zeros((N, N, 3))
+    z[:,:,0] = x.reshape(-1,1)
+    z[:,:,1] = x
+    fast_grid = z.reshape(N*N, 3)
+    return fast_grid
--- a/benchmark/benchmarks/cronbach.py
+++ b/benchmark/benchmarks/cronbach.py
+#from: http://stackoverflow.com/questions/20799403/improving-performance-of-cronbach-alpha-code-python-numpy
+#pythran export cronbach(float [][])
+#setup: import numpy as np ; N = 600 ; items = np.random.rand(N,N)
+#run: cronbach(items)
+def cronbach(itemscores):
+    itemvars = itemscores.var(axis=1, ddof=1)
+    tscores = itemscores.sum(axis=0)
+    nitems = len(itemscores)
+    return nitems / (nitems-1) * (1 - itemvars.sum() / tscores.var(ddof=1))
--- a/benchmark/benchmarks/diffusion.py
+++ b/benchmark/benchmarks/diffusion.py
+#setup: import numpy as np;lx,ly=(2**7,2**7);u=np.zeros([lx,ly],dtype=np.double);u[lx//2,ly//2]=1000.0;tempU=np.zeros([lx,ly],dtype=np.double)
+#run: diffusion(u,tempU,100)
+#pythran export diffusion(float [][], float [][], int)
+import numpy as np
+def diffusion(u, tempU, iterNum):
+    """
+    Apply Numpy matrix for the Forward-Euler Approximation
+    """
+    mu = .1
+    for n in range(iterNum):
+        tempU[1:-1, 1:-1] = u[1:-1, 1:-1] + mu * (
+            u[2:, 1:-1] - 2 * u[1:-1, 1:-1] + u[0:-2, 1:-1] +
+            u[1:-1, 2:] - 2 * u[1:-1, 1:-1] + u[1:-1, 0:-2])
+        u[:, :] = tempU[:, :]
+        tempU[:, :] = 0.0
--- a/benchmark/benchmarks/evolve.py
+++ b/benchmark/benchmarks/evolve.py
+#setup: import numpy as np ; grid_shape = (512, 512) ; grid = np.zeros(grid_shape) ; block_low = int(grid_shape[0] * .4) ; block_high = int(grid_shape[0] * .5) ; grid[block_low:block_high, block_low:block_high] = 0.005
+#run: evolve(grid, 0.1)
+#from: High Performance Python by Micha Gorelick and Ian Ozsvald, http://shop.oreilly.com/product/0636920028963.do
+#pythran export evolve(float64[][], float)
+import numpy as np
+def laplacian(grid):
+    return np.roll(grid, +1, 0) + np.roll(grid, -1, 0) + np.roll(grid, +1, 1) + np.roll(grid, -1, 1) - 4 * grid
+def evolve(grid, dt, D=1):
+    return grid + dt * D * laplacian(grid)
--- a/benchmark/benchmarks/fdtd.py
+++ b/benchmark/benchmarks/fdtd.py
+#from: http://stackoverflow.com/questions/19367488/converting-function-to-numbapro-cuda
+#setup: N = 10 ; import numpy ; a = numpy.random.rand(N,N)
+#run: fdtd(a,10)
+#pythran export fdtd(float[][], int)
+import numpy as np
+def fdtd(input_grid, steps):
+    grid = input_grid.copy()
+    old_grid = np.zeros_like(input_grid)
+    previous_grid = np.zeros_like(input_grid)
+    l_x = grid.shape[0]
+    l_y = grid.shape[1]
+    for i in range(steps):
+        np.copyto(previous_grid, old_grid)
+        np.copyto(old_grid, grid)
+        for x in range(l_x):
+            for y in range(l_y):
+                grid[x,y] = 0.0
+                if 0 < x+1 < l_x:
+                    grid[x,y] += old_grid[x+1,y]
+                if 0 < x-1 < l_x:
+                    grid[x,y] += old_grid[x-1,y]
+                if 0 < y+1 < l_y:
+                    grid[x,y] += old_grid[x,y+1]
+                if 0 < y-1 < l_y:
+                    grid[x,y] += old_grid[x,y-1]
+                grid[x,y] /= 2.0
+                grid[x,y] -= previous_grid[x,y]
+    return grid
--- a/benchmark/benchmarks/fft.py
+++ b/benchmark/benchmarks/fft.py
+#setup: N = 2**10 ; import numpy ; a = numpy.array(numpy.random.rand(N), dtype=complex)
+#run: fft(a)
+#pythran export fft(complex [])
+import math, numpy as np
+def fft(x):
+   N = x.shape[0]
+   if N == 1:
+       return np.array(x)
+   e=fft(x[::2])
+   o=fft(x[1::2])
+   M=N//2
+   l=[ e[k] + o[k]*math.e**(-2j*math.pi*k/N) for k in range(M) ]
+   r=[ e[k] - o[k]*math.e**(-2j*math.pi*k/N) for k in range(M) ]
+   return np.array(l+r)
--- a/benchmark/benchmarks/grayscott.py
+++ b/benchmark/benchmarks/grayscott.py
+#from http://stackoverflow.com/questions/26823312/numba-or-cython-acceleration-in-reaction-diffusion-algorithm
+#setup: pass
+#run: grayscott(40, 0.16, 0.08, 0.04, 0.06)
+#pythran export grayscott(int, float, float, float, float)
+import numpy as np
+def grayscott(counts, Du, Dv, F, k):
+    n = 300
+    U = np.zeros((n+2,n+2), dtype=np.float32)
+    V = np.zeros((n+2,n+2), dtype=np.float32)
+    u, v = U[1:-1,1:-1], V[1:-1,1:-1]
+    r = 20
+    u[:] = 1.0
+    U[n//2-r:n//2+r,n//2-r:n//2+r] = 0.50
+    V[n//2-r:n//2+r,n//2-r:n//2+r] = 0.25
+    u += 0.15*np.random.random((n,n))
+    v += 0.15*np.random.random((n,n))
+    for i in range(counts):
+        Lu = (                 U[0:-2,1:-1] +
+              U[1:-1,0:-2] - 4*U[1:-1,1:-1] + U[1:-1,2:] +
+                               U[2:  ,1:-1] )
+        Lv = (                 V[0:-2,1:-1] +
+              V[1:-1,0:-2] - 4*V[1:-1,1:-1] + V[1:-1,2:] +
+                               V[2:  ,1:-1] )
+        uvv = u*v*v
+        u += Du*Lu - uvv + F*(1 - u)
+        v += Dv*Lv + uvv - (F + k)*v
+    return V
--- a/benchmark/benchmarks/grouping.py
+++ b/benchmark/benchmarks/grouping.py
+#from: http://stackoverflow.com/questions/4651683/numpy-grouping-using-itertools-groupby-performance
+#setup: import numpy as np ; N = 350000 ; values = np.array(np.random.randint(0,3298,size=N),dtype='u4') ; values.sort()
+#run: grouping(values)
+#pythran export grouping(uint32 [])
+def grouping(values):
+    import numpy as np
+    diff = np.concatenate(([1], np.diff(values)))
+    idx = np.concatenate((np.where(diff)[0], [len(values)]))
+    return values[idx[:-1]], np.diff(idx)
--- a/benchmark/benchmarks/growcut.py
+++ b/benchmark/benchmarks/growcut.py
+#from: http://continuum.io/blog/numba_performance
+#setup: N = 10 ; import numpy as np ; image = np.random.rand(N, N, 3) ; state = np.zeros((N, N, 2)) ; state_next = np.empty_like(state) ; state[0, 0, 0] = state[0, 0, 1] = 1
+#run: growcut(image, state, state_next, 2)
+#pythran export growcut(float[][][], float[][][], float[][][], int)
+import math
+import numpy as np
+def window_floor(idx, radius):
+    if radius > idx:
+        return 0
+    else:
+        return idx - radius
+def window_ceil(idx, ceil, radius):
+    if idx + radius > ceil:
+        return ceil
+    else:
+        return idx + radius
+def growcut(image, state, state_next, window_radius):
+    changes = 0
+    sqrt_3 = math.sqrt(3.0)
+    height = image.shape[0]
+    width = image.shape[1]
+    for j in range(width):
+        for i in range(height):
+            winning_colony = state[i, j, 0]
+            defense_strength = state[i, j, 1]
+            for jj in range(window_floor(j, window_radius),
+                             window_ceil(j+1, width, window_radius)):
+                for ii in range(window_floor(i, window_radius),
+                                 window_ceil(i+1, height, window_radius)):
+                    if (ii != i and jj != j):
+                        d = image[i, j, 0] - image[ii, jj, 0]
+                        s = d * d
+                        for k in range(1, 3):
+                            d = image[i, j, k] - image[ii, jj, k]
+                            s += d * d
+                        gval = 1.0 - math.sqrt(s)/sqrt_3
+                        attack_strength = gval * state[ii, jj, 1]
+                        if attack_strength > defense_strength:
+                            defense_strength = attack_strength
+                            winning_colony = state[ii, jj, 0]
+                            changes += 1
+            state_next[i, j, 0] = winning_colony
+            state_next[i, j, 1] = defense_strength
+    return changes
--- a/benchmark/benchmarks/harris.py
+++ b/benchmark/benchmarks/harris.py
+#from: parakeet testbed
+#setup: import numpy as np ; M, N = 512, 512 ; I = np.random.randn(M,N)
+#run: harris(I)
+#pythran export harris(float64[][])
+import numpy as np
+def harris(I):
+  m,n = I.shape
+  dx = (I[1:, :] - I[:m-1, :])[:, 1:]
+  dy = (I[:, 1:] - I[:, :n-1])[1:, :]
+  #
+  #   At each point we build a matrix
+  #   of derivative products
+  #   M =
+  #   | A = dx^2     C = dx * dy |
+  #   | C = dy * dx  B = dy * dy |
+  #
+  #   and the score at that point is:
+  #      det(M) - k*trace(M)^2
+  #
+  A = dx * dx
+  B = dy * dy
+  C = dx * dy
+  tr = A + B
+  det = A * B - C * C
+  k = 0.05
+  return det - k * tr * tr
--- a/benchmark/benchmarks/hasting.py
+++ b/benchmark/benchmarks/hasting.py
+#from: http://wiki.scipy.org/Cookbook/Theoretical_Ecology/Hastings_and_Powell
+#setup: import numpy as np ; y = np.random.rand(3) ; args = np.random.rand(7)
+#run: hasting(y, *args)
+#pythran export hasting(float [], float, float, float, float, float, float, float)
+import numpy as np
+def hasting(y, t, a1, a2, b1, b2, d1, d2):
+    yprime = np.empty((3,))
+    yprime[0] = y[0] * (1. - y[0]) - a1*y[0]*y[1]/(1. + b1 * y[0])
+    yprime[1] = a1*y[0]*y[1] / (1. + b1 * y[0]) - a2 * y[1]*y[2] / (1. + b2 * y[1]) - d1 * y[1]
+    yprime[2] = a2*y[1]*y[2]/(1. + b2*y[1]) - d2*y[2]
+    return yprime
--- a/benchmark/benchmarks/hyantes.py
+++ b/benchmark/benchmarks/hyantes.py
+#setup: import numpy ; a = numpy.array([ [i/10., i/10., i/20.] for i in range(44440)], dtype=numpy.double)
+#run: hyantes(0, 0, 90, 90, 1, 100, 80, 80, a)
+#pythran export hyantes(float, float, float, float, float, float, int, int, float[][])
+import numpy as np
+def hyantes(xmin, ymin, xmax, ymax, step, range_, range_x, range_y, t):
+    X,Y = t.shape
+    pt = np.zeros((X,Y))
+    for i in range(X):
+        for j in range(Y):
+            for k in t:
+                tmp = 6368.* np.arccos( np.cos(xmin+step*i)*np.cos( k[0] ) * np.cos((ymin+step*j)-k[1])+  np.sin(xmin+step*i)*np.sin(k[0]))
+                if tmp < range_:
+                    pt[i,j]+=k[2] / (1+tmp)
+    return pt
--- a/benchmark/benchmarks/julia.py
+++ b/benchmark/benchmarks/julia.py
+#setup: N=50
+#run: julia(1., 1., N, 1.5, 10., 1e4)
+#pythran export julia(float, float, int, float, float, float)
+import numpy as np
+from time import time
+def kernel(zr, zi, cr, ci, lim, cutoff):
+    ''' Computes the number of iterations `n` such that
+        |z_n| > `lim`, where `z_n = z_{n-1}**2 + c`.
+    '''
+    count = 0
+    while ((zr*zr + zi*zi) < (lim*lim)) and count < cutoff:
+        zr, zi = zr * zr - zi * zi + cr, 2 * zr * zi + ci
+        count += 1
+    return count
+def julia(cr, ci, N, bound=1.5, lim=1000., cutoff=1e6):
+    ''' Pure Python calculation of the Julia set for a given `c`.  No NumPy
+        array operations are used.
+    '''
+    julia = np.empty((N, N), np.uint32)
+    grid_x = np.linspace(-bound, bound, N)
+    for i, x in enumerate(grid_x):
+        for j, y in enumerate(grid_x):
+            julia[i,j] = kernel(x, y, cr, ci, lim, cutoff)
+    return julia
--- a/benchmark/benchmarks/l2norm.py
+++ b/benchmark/benchmarks/l2norm.py
+#from: http://stackoverflow.com/questions/7741878/how-to-apply-numpy-linalg-norm-to-each-row-of-a-matrix/7741976#7741976
+#setup: import numpy as np ; N = 1000; x = np.random.rand(N,N)
+#run: l2norm(x)
+#pythran export l2norm(float64[][])
+import numpy as np
+def l2norm(x):
+    return np.sqrt(np.sum(np.abs(x)**2, 1))
--- a/benchmark/benchmarks/local_maxima.py
+++ b/benchmark/benchmarks/local_maxima.py
+#from: https://github.com/iskandr/parakeet/blob/master/benchmarks/nd_local_maxima.py
+#setup: import numpy as np ; shape = (3,4,3,2) ; x = np.arange(72, dtype=np.float64).reshape(*shape)
+#run: local_maxima(x)
+#pythran export local_maxima(float [][][][])
+import numpy as np
+def wrap(pos, offset, bound):
+    return ( pos + offset ) % bound
+def clamp(pos, offset, bound):
+    return min(bound-1,max(0,pos+offset))
+def reflect(pos, offset, bound):
+    idx = pos+offset
+    return min(2*(bound-1)-idx,max(idx,-idx))
+def local_maxima(data, mode=wrap):
+  wsize = data.shape
+  result = np.ones(data.shape, bool)
+  for pos in np.ndindex(data.shape):
+    myval = data[pos]
+    for offset in np.ndindex(wsize):
+      neighbor_idx = tuple(mode(p, o-w//2, w) for (p, o, w) in zip(pos, offset, wsize))
+      result[pos] &= (data[neighbor_idx] <= myval)
+  return result
--- a/benchmark/benchmarks/log_likelihood.py
+++ b/benchmark/benchmarks/log_likelihood.py
+#setup: import numpy as np ; N = 100000 ; a = np.random.random(N); b = 0.1; c =1.1
+#run: log_likelihood(a, b, c)
+#from: http://arogozhnikov.github.io/2015/09/08/SpeedBenchmarks.html
+import numpy
+#pythran export log_likelihood(float64[], float64, float64)
+def log_likelihood(data, mean, sigma):
+    s = (data - mean) ** 2 / (2 * (sigma ** 2))
+    pdfs = numpy.exp(- s)
+    pdfs /= numpy.sqrt(2 * numpy.pi) * sigma
+    return numpy.log(pdfs).sum()
--- a/benchmark/benchmarks/lstsqr.py
+++ b/benchmark/benchmarks/lstsqr.py
+#setup: import numpy as np ; N = 500000 ; X, Y = np.random.rand(N), np.random.rand(N)
+#run: lstsqr(X, Y)
+#from: http://nbviewer.ipython.org/github/rasbt/One-Python-benchmark-per-day/blob/master/ipython_nbs/day10_fortran_lstsqr.ipynb
+#pythran export lstsqr(float[], float[])
+import numpy as np
+def lstsqr(x, y):
+    """ Computes the least-squares solution to a linear matrix equation. """
+    x_avg = np.average(x)
+    y_avg = np.average(y)
+    dx = x - x_avg
+    dy = y - y_avg
+    var_x = np.sum(dx**2)
+    cov_xy = np.sum(dx * (y - y_avg))
+    slope = cov_xy / var_x
+    y_interc = y_avg - slope*x_avg
+    return (slope, y_interc)
--- a/benchmark/benchmarks/mandel.py
+++ b/benchmark/benchmarks/mandel.py
+#setup: import numpy as np; image = np.zeros((64, 32), dtype = np.uint8)
+#run: mandel(-2.0, 1.0, -1.0, 1.0, image, 20)
+#pythran export mandel(float, float, float, float, uint8[][], int)
+def kernel(x, y, max_iters):
+  """
+    Given the real and imaginary parts of a complex number,
+    determine if it is a candidate for membership in the Mandelbrot
+    set given a fixed number of iterations.
+  """
+  c = complex(x, y)
+  z = 0.0j
+  for i in range(max_iters):
+    z = z*z + c
+    if (z.real*z.real + z.imag*z.imag) >= 4:
+      return i
+  return max_iters
+def mandel(min_x, max_x, min_y, max_y, image, iters):
+  height = image.shape[0]
+  width = image.shape[1]
+  pixel_size_x = (max_x - min_x) / width
+  pixel_size_y = (max_y - min_y) / height
+  for x in range(width):
+    real = min_x + x * pixel_size_x
+    for y in range(height):
+      imag = min_y + y * pixel_size_y
+      color = kernel(real, imag, iters)
+      image[y, x] = color
--- a/benchmark/benchmarks/multiple_sum.py
+++ b/benchmark/benchmarks/multiple_sum.py
+#from http://stackoverflow.com/questions/77999777799977/numpy-vs-cython-speed
+#pythran export multiple_sum(float[][])
+#setup: import numpy as np ; r = np.random.rand(100,100)
+#run: multiple_sum(r)
+import numpy as np
+def multiple_sum(array):
+    rows = array.shape[0]
+    cols = array.shape[1]
+    out = np.zeros((rows, cols))
+    for row in range(0, rows):
+        out[row, :] = np.sum(array - array[row, :], 0)
+    return out
--- a/benchmark/benchmarks/pairwise.py
+++ b/benchmark/benchmarks/pairwise.py
+#from: http://jakevdp.github.com/blog/2012/08/24/numba-vs-cython/
+#setup: import numpy as np ; X = np.linspace(0,10,200).reshape(20,10)
+#run: pairwise(X)
+#pythran export pairwise(float [][])
+import numpy as np
+def pairwise(X):
+    M, N = X.shape
+    D = np.empty((M,M))
+    for i in range(M):
+        for j in range(M):
+            d = 0.0
+            for k in range(N):
+                tmp = X[i,k] - X[j,k]
+                d += tmp * tmp
+            D[i,j] = np.sqrt(d)
+    return D
--- a/benchmark/benchmarks/periodic_dist.py
+++ b/benchmark/benchmarks/periodic_dist.py
+#setup: import numpy as np ; N = 20 ; x = y = z = np.arange(0., N, 0.1) ; L = 4 ; periodic = True
+#run: periodic_dist(x, x, x, L,periodic, periodic, periodic)
+#pythran export periodic_dist(float [], float[], float[], int, bool, bool, bool)
+import numpy as np 
+def periodic_dist(x, y, z, L, periodicX, periodicY, periodicZ):
+    " ""Computes distances between all particles and places the result in a matrix such that the ij th matrix entry corresponds to the distance between particle i and j"" "
+    N = len(x)
+    xtemp = np.tile(x,(N,1))
+    dx = xtemp - xtemp.T
+    ytemp = np.tile(y,(N,1))
+    dy = ytemp - ytemp.T
+    ztemp = np.tile(z,(N,1))
+    dz = ztemp - ztemp.T
+    # Particles 'feel' each other across the periodic boundaries
+    if periodicX:
+        dx[dx>L/2]=dx[dx > L/2]-L
+        dx[dx<-L/2]=dx[dx < -L/2]+L
+    if periodicY:
+        dy[dy>L/2]=dy[dy>L/2]-L
+        dy[dy<-L/2]=dy[dy<-L/2]+L
+    if periodicZ:
+        dz[dz>L/2]=dz[dz>L/2]-L
+        dz[dz<-L/2]=dz[dz<-L/2]+L
+    # Total Distances
+    d = np.sqrt(dx**2+dy**2+dz**2)
+    # Mark zero entries with negative 1 to avoid divergences
+    d[d==0] = -1
+    return d, dx, dy, dz
--- a/benchmark/benchmarks/repeating.py
+++ b/benchmark/benchmarks/repeating.py
+#from: http://stackoverflow.com/questions/14553331/how-to-improve-numpy-performance-in-this-short-code
+#pythran export repeating(float[], int)
+#setup: import numpy as np ; a = np.random.rand(10000)
+#run: repeating(a, 20)
+import numpy as np
+def repeating(x, nvar_y):
+    nvar_x = x.shape[0]
+    y = np.empty(nvar_x*(1+nvar_y))
+    y[0:nvar_x] = x[0:nvar_x]
+    y[nvar_x:] = np.repeat(x,nvar_y)
+    return y
--- a/benchmark/benchmarks/reverse_cumsum.py
+++ b/benchmark/benchmarks/reverse_cumsum.py
+#from: http://stackoverflow.com/questions/16541618/perform-a-reverse-cumulative-sum-on-a-numpy-array
+#pythran export reverse_cumsum(float[])
+#setup: import numpy as np ; r = np.random.rand(1000000)
+#run: reverse_cumsum(r)
+import numpy as np
+def reverse_cumsum(x):
+    return np.cumsum(x[::-1])[::-1]
--- a/benchmark/benchmarks/rosen.py
+++ b/benchmark/benchmarks/rosen.py
+#setup: import numpy as np; r = np.arange(1000000, dtype=float)
+#run: rosen(r)
+import numpy as np
+#pythran export rosen(float[])
+def rosen(x):
+    t0 = 100 * (x[1:] - x[:-1] ** 2) ** 2
+    t1 = (1 - x[:-1]) ** 2
+    return np.sum(t0 + t1)
--- a/benchmark/benchmarks/slowparts.py
+++ b/benchmark/benchmarks/slowparts.py
+#from: https://groups.google.com/forum/#!topic/parakeet-python/p-flp2kdE4U
+#setup: import numpy as np ;d = 10 ;re = 5 ;params = (d, re, np.ones((2*d, d+1, re)), np.ones((d, d+1, re)),  np.ones((d, 2*d)), np.ones((d, 2*d)), np.ones((d+1, re, d)), np.ones((d+1, re, d)), 1)
+#run: slowparts(*params)
+#pythran export slowparts(int, int, float [][][], float [][][], float [][], float [][], float [][][], float [][][], int)
+from numpy import zeros, power, tanh
+def slowparts(d, re, preDz, preWz, SRW, RSW, yxV, xyU, resid):
+    """ computes the linear algebra intensive part of the gradients of the grae
+    """
+    fprime = lambda x: 1 - power(tanh(x), 2)
+    partialDU = zeros((d+1, re, 2*d, d))
+    for k in range(2*d):
+        for i in range(d):
+            partialDU[:,:,k,i] = fprime(preDz[k]) * fprime(preWz[i]) * (SRW[i,k] + RSW[i,k]) * yxV[:,:,i]
+    return partialDU
--- a/benchmark/benchmarks/smoothing.py
+++ b/benchmark/benchmarks/smoothing.py
+#setup: import numpy as np ; N = 1000 ; a = np.arange(0,1,N)
+#run: smoothing(a, .4)
+#from: http://www.parakeetpython.com/
+#pythran export smoothing(float[], float)
+def smoothing(x, alpha):
+  """
+  Exponential smoothing of a time series
+  For x = 10**6 floats
+  - Python runtime: 9 seconds
+  - Parakeet runtime: .01 seconds
+  """
+  s = x.copy()
+  for i in range(1, len(x)):
+    s[i] = alpha * x[i] + (1 - alpha) * s[i-1]
+  return s
--- a/benchmark/benchmarks/specialconvolve.py
+++ b/benchmark/benchmarks/specialconvolve.py
+#from: http://stackoverflow.com/questions/2196693/improving-numpy-performance
+#pythran export specialconvolve(uint32 [][])
+#setup: import numpy as np ; r = np.arange(100*10000, dtype=np.uint32).reshape(1000,1000)
+#run: specialconvolve(r)
+def specialconvolve(a):
+    # sorry, you must pad the input yourself
+    rowconvol = a[1:-1,:] + a[:-2,:] + a[2:,:]
+    colconvol = rowconvol[:,1:-1] + rowconvol[:,:-2] + rowconvol[:,2:] - 9*a[1:-1,1:-1]
+    return colconvol
--- a/benchmark/benchmarks/vibr_energy.py
+++ b/benchmark/benchmarks/vibr_energy.py
+#from: http://stackoverflow.com/questions/17112550/python-and-numba-for-vectorized-functions
+#setup: import numpy as np ; N = 100000 ; a, b, c = np.random.rand(N), np.random.rand(N), np.random.rand(N)
+#run: vibr_energy(a, b, c)
+#pythran export vibr_energy(float64[], float64[], float64[])
+import numpy
+def vibr_energy(harmonic, anharmonic, i):
+    return numpy.exp(-harmonic * i - anharmonic * (i ** 2))
--- a/benchmark/benchmarks/wave.py
+++ b/benchmark/benchmarks/wave.py
+#from https://github.com/sklam/numba-example-wavephysics
+#setup: N=100
+#run: wave(N)
+import numpy as np
+from math import ceil
+def physics(masspoints, dt, plunk, which):
+  ppos = masspoints[1]
+  cpos = masspoints[0]
+  N = cpos.shape[0]
+  # apply hooke's law
+  HOOKE_K = 2100000.
+  DAMPING = 0.0001
+  MASS = .01
+  force = np.zeros((N, 2))
+  for i in range(1, N):
+    dx, dy = cpos[i] - cpos[i - 1]
+    dist = np.sqrt(dx**2 + dy**2)
+    assert dist != 0
+    fmag = -HOOKE_K * dist
+    cosine = dx / dist
+    sine = dy / dist
+    fvec = np.array([fmag * cosine, fmag * sine])
+    force[i - 1] -= fvec
+    force[i] += fvec
+  force[0] = force[-1] = 0, 0
+  force[which][1] += plunk
+  accel = force / MASS
+  # verlet integration
+  npos = (2 - DAMPING) * cpos - (1 - DAMPING) * ppos + accel * (dt**2)
+  masspoints[1] = cpos
+  masspoints[0] = npos
+#pythran export wave(int)
+def wave(PARTICLE_COUNT):
+    SUBDIVISION = 300
+    FRAMERATE = 60
+    count = PARTICLE_COUNT
+    width, height = 1200, 400
+    masspoints = np.empty((2, count, 2), np.float64)
+    initpos = np.zeros(count, np.float64)
+    for i in range(1, count):
+        initpos[i] = initpos[i - 1] + float(width) / count
+    masspoints[:, :, 0] = initpos
+    masspoints[:, :, 1] = height / 2
+    f = 15
+    plunk_pos = count // 2
+    physics( masspoints, 1./ (SUBDIVISION * FRAMERATE), f, plunk_pos)
+    return masspoints[0, count // 2]
--- a/benchmark/benchmarks/wdist.py
+++ b/benchmark/benchmarks/wdist.py
+#from: http://stackoverflow.com/questions/19277244/fast-weighted-euclidean-distance-between-points-in-arrays/19277334#19277334
+#setup: import numpy as np ; N = 10 ; A = np.random.rand(N,N) ; B =  np.random.rand(N,N) ; W = np.random.rand(N,N)
+#run: wdist(A,B,W)
+#pythran export wdist(float64 [][], float64 [][], float64[][])
+import numpy as np
+def wdist(A, B, W):
+    k,m = A.shape
+    _,n = B.shape
+    D = np.zeros((m, n))
+    for ii in range(m):
+        for jj in range(n):
+            wdiff = (A[:,ii] - B[:,jj]) / W[:,ii]
+            D[ii,jj] = np.sqrt((wdiff**2).sum())
+    return D
--- a/benchmark/plot_benchmark.py
+++ b/benchmark/plot_benchmark.py
+import matplotlib.pyplot as plt
+import numpy as np
+import json
+import sys
+plt.rcdefaults()
+fig, ax = plt.subplots(constrained_layout=True, figsize=(8, 8))
+with open(sys.argv[-2]) as fp:
+    content = json.load(fp)
+results = []
+for k, v in content.items():
+    results.append((k, v[1] / v[0]))
+results.sort(key=lambda x: x[1], reverse=True)
+names = [x[0] for x in results]
+values = [x[1] for x in results]
+y_pos = np.arange(len(results))
+ax.barh(y_pos, values, align='center')
+ax.set_yticks(y_pos)
+ax.set_yticklabels(names)
+ax.invert_yaxis()
+ax.set_xlabel('Slowdown factor')
+ax.set_title('Native vs. WebAssembly Python benchmarks')
+ax.grid()
+plt.savefig(sys.argv[-1])
--- a/numpy/Makefile
+++ b/numpy/Makefile
@@ -77,7 +77,7 @@ $(HOSTDIR)/setup.py: $(ZIPFILE)
 $(HOSTBUILD)/lib.$(PLATFORMSLUG)/numpy/__init__.py: $(ROOT)/.patched
 	( \
 	  cd $(HOSTDIR); \
-    $(HOSTPYTHON) setup.py build \
+    $(HOSTPYTHON) setup.py install \
 	)

--- a/src/pystone.py
+++ b/src/pystone.py
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+"PYSTONE" Benchmark Program
+Version:        Python/1.1 (corresponds to C/1.1 plus 2 Pystone fixes)
+Author:         Reinhold P. Weicker,  CACM Vol 27, No 10, 10/84 pg. 1013.
+                Translated from ADA to C by Rick Richardson.
+                Every method to preserve ADA-likeness has been used,
+                at the expense of C-ness.
+                Translated from C to Python by Guido van Rossum.
+Version History:
+                Inofficial version 1.1.1 by Chris Arndt:
+                - Make it run under Python 2 and 3 by using
+                  "from __future__ import print_function".
+                - Change interpreter name in shebang line to plain
+                  "python".
+                - Add source code encoding declaration.
+                Version 1.1 corrects two bugs in version 1.0:
+                First, it leaked memory: in Proc1(), NextRecord ends
+                up having a pointer to itself.  I have corrected this
+                by zapping NextRecord.PtrComp at the end of Proc1().
+                Second, Proc3() used the operator != to compare a
+                record to None.  This is rather inefficient and not
+                true to the intention of the original benchmark (where
+                a pointer comparison to None is intended; the !=
+                operator attempts to find a method __cmp__ to do value
+                comparison of the record).  Version 1.1 runs 5-10
+                percent faster than version 1.0, so benchmark figures
+                of different versions can't be compared directly.
+"""
+from __future__ import print_function
+LOOPS = 50000
+from time import clock
+__version__ = "1.1.1"
+[Ident1, Ident2, Ident3, Ident4, Ident5] = range(1, 6)
+class Record:
+    def __init__(self, PtrComp = None, Discr = 0, EnumComp = 0,
+                       IntComp = 0, StringComp = 0):
+        self.PtrComp = PtrComp
+        self.Discr = Discr
+        self.EnumComp = EnumComp
+        self.IntComp = IntComp
+        self.StringComp = StringComp
+    def copy(self):
+        return Record(self.PtrComp, self.Discr, self.EnumComp,
+                      self.IntComp, self.StringComp)
+TRUE = 1
+FALSE = 0
+def main(loops=LOOPS):
+    benchtime, stones = pystones(loops)
+    print("%g" % benchtime)
+def pystones(loops=LOOPS):
+    return Proc0(loops)
+IntGlob = 0
+BoolGlob = FALSE
+Char1Glob = '\0'
+Char2Glob = '\0'
+Array1Glob = [0]*51
+Array2Glob = [x[:] for x in [Array1Glob]*51]
+PtrGlb = None
+PtrGlbNext = None
+def Proc0(loops=LOOPS):
+    global IntGlob
+    global BoolGlob
+    global Char1Glob
+    global Char2Glob
+    global Array1Glob
+    global Array2Glob
+    global PtrGlb
+    global PtrGlbNext
+    starttime = clock()
+    for i in range(loops):
+        pass
+    nulltime = clock() - starttime
+    PtrGlbNext = Record()
+    PtrGlb = Record()
+    PtrGlb.PtrComp = PtrGlbNext
+    PtrGlb.Discr = Ident1
+    PtrGlb.EnumComp = Ident3
+    PtrGlb.IntComp = 40
+    PtrGlb.StringComp = "DHRYSTONE PROGRAM, SOME STRING"
+    String1Loc = "DHRYSTONE PROGRAM, 1'ST STRING"
+    Array2Glob[8][7] = 10
+    starttime = clock()
+    for i in range(loops):
+        Proc5()
+        Proc4()
+        IntLoc1 = 2
+        IntLoc2 = 3
+        String2Loc = "DHRYSTONE PROGRAM, 2'ND STRING"
+        EnumLoc = Ident2
+        BoolGlob = not Func2(String1Loc, String2Loc)
+        while IntLoc1 < IntLoc2:
+            IntLoc3 = 5 * IntLoc1 - IntLoc2
+            IntLoc3 = Proc7(IntLoc1, IntLoc2)
+            IntLoc1 = IntLoc1 + 1
+        Proc8(Array1Glob, Array2Glob, IntLoc1, IntLoc3)
+        PtrGlb = Proc1(PtrGlb)
+        CharIndex = 'A'
+        while CharIndex <= Char2Glob:
+            if EnumLoc == Func1(CharIndex, 'C'):
+                EnumLoc = Proc6(Ident1)
+            CharIndex = chr(ord(CharIndex)+1)
+        IntLoc3 = IntLoc2 * IntLoc1
+        IntLoc2 = IntLoc3 / IntLoc1
+        IntLoc2 = 7 * (IntLoc3 - IntLoc2) - IntLoc1
+        IntLoc1 = Proc2(IntLoc1)
+    benchtime = clock() - starttime - nulltime
+    if benchtime == 0.0:
+        loopsPerBenchtime = 0.0
+    else:
+        loopsPerBenchtime = (loops / benchtime)
+    return benchtime, loopsPerBenchtime
+def Proc1(PtrParIn):
+    PtrParIn.PtrComp = NextRecord = PtrGlb.copy()
+    PtrParIn.IntComp = 5
+    NextRecord.IntComp = PtrParIn.IntComp
+    NextRecord.PtrComp = PtrParIn.PtrComp
+    NextRecord.PtrComp = Proc3(NextRecord.PtrComp)
+    if NextRecord.Discr == Ident1:
+        NextRecord.IntComp = 6
+        NextRecord.EnumComp = Proc6(PtrParIn.EnumComp)
+        NextRecord.PtrComp = PtrGlb.PtrComp
+        NextRecord.IntComp = Proc7(NextRecord.IntComp, 10)
+    else:
+        PtrParIn = NextRecord.copy()
+    NextRecord.PtrComp = None
+    return PtrParIn
+def Proc2(IntParIO):
+    IntLoc = IntParIO + 10
+    while 1:
+        if Char1Glob == 'A':
+            IntLoc = IntLoc - 1
+            IntParIO = IntLoc - IntGlob
+            EnumLoc = Ident1
+        if EnumLoc == Ident1:
+            break
+    return IntParIO
+def Proc3(PtrParOut):
+    global IntGlob
+    if PtrGlb is not None:
+        PtrParOut = PtrGlb.PtrComp
+    else:
+        IntGlob = 100
+    PtrGlb.IntComp = Proc7(10, IntGlob)
+    return PtrParOut
+def Proc4():
+    global Char2Glob
+    BoolLoc = Char1Glob == 'A'
+    BoolLoc = BoolLoc or BoolGlob
+    Char2Glob = 'B'
+def Proc5():
+    global Char1Glob
+    global BoolGlob
+    Char1Glob = 'A'
+    BoolGlob = FALSE
+def Proc6(EnumParIn):
+    EnumParOut = EnumParIn
+    if not Func3(EnumParIn):
+        EnumParOut = Ident4
+    if EnumParIn == Ident1:
+        EnumParOut = Ident1
+    elif EnumParIn == Ident2:
+        if IntGlob > 100:
+            EnumParOut = Ident1
+        else:
+            EnumParOut = Ident4
+    elif EnumParIn == Ident3:
+        EnumParOut = Ident2
+    elif EnumParIn == Ident4:
+        pass
+    elif EnumParIn == Ident5:
+        EnumParOut = Ident3
+    return EnumParOut
+def Proc7(IntParI1, IntParI2):
+    IntLoc = IntParI1 + 2
+    IntParOut = IntParI2 + IntLoc
+    return IntParOut
+def Proc8(Array1Par, Array2Par, IntParI1, IntParI2):
+    global IntGlob
+    IntLoc = IntParI1 + 5
+    Array1Par[IntLoc] = IntParI2
+    Array1Par[IntLoc+1] = Array1Par[IntLoc]
+    Array1Par[IntLoc+30] = IntLoc
+    for IntIndex in range(IntLoc, IntLoc+2):
+        Array2Par[IntLoc][IntIndex] = IntLoc
+    Array2Par[IntLoc][IntLoc-1] = Array2Par[IntLoc][IntLoc-1] + 1
+    Array2Par[IntLoc+20][IntLoc] = Array1Par[IntLoc]
+    IntGlob = 5
+def Func1(CharPar1, CharPar2):
+    CharLoc1 = CharPar1
+    CharLoc2 = CharLoc1
+    if CharLoc2 != CharPar2:
+        return Ident1
+    else:
+        return Ident2
+def Func2(StrParI1, StrParI2):
+    IntLoc = 1
+    while IntLoc <= 1:
+        if Func1(StrParI1[IntLoc], StrParI2[IntLoc+1]) == Ident1:
+            CharLoc = 'A'
+            IntLoc = IntLoc + 1
+    if CharLoc >= 'W' and CharLoc <= 'Z':
+        IntLoc = 7
+    if CharLoc == 'X':
+        return TRUE
+    else:
+        if StrParI1 > StrParI2:
+            IntLoc = IntLoc + 7
+            return TRUE
+        else:
+            return FALSE
+def Func3(EnumParIn):
+    EnumLoc = EnumParIn
+    if EnumLoc == Ident3: return TRUE
+    return FALSE
+if __name__ == '__main__':
+    import sys
+    def error(msg):
+        print(msg, end=' ', file=sys.stderr)
+        print("usage: %s [number_of_loops]" % sys.argv[0], file=sys.stderr)
+        sys.exit(100)
+    nargs = len(sys.argv) - 1
+    if nargs > 1:
+        error("%d arguments are too many;" % nargs)
+    elif nargs == 1:
+        try: loops = int(sys.argv[1])
+        except ValueError:
+            error("Invalid argument %r;" % sys.argv[1])
+    else:
+        loops = LOOPS
+    main(loops)
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -4,7 +4,10 @@ Various common utilities for testing.
 import pathlib
-import pytest
+try:
+    import pytest
+except ImportError:
+    pytest = None
 TEST_PATH = pathlib.Path(__file__).parents[0].resolve()
@@ -47,8 +50,9 @@ class SeleniumWrapper:
            yield self.driver.current_url
-@pytest.fixture
+if pytest is not None:
-def selenium():
+    @pytest.fixture
-    selenium = SeleniumWrapper()
+    def selenium():
-    yield selenium
+        selenium = SeleniumWrapper()
-    selenium.driver.quit()
+        yield selenium
+        selenium.driver.quit()