Commit 20547723 authored by gabrieldemarmiesse's avatar gabrieldemarmiesse

Changed the numpy tutorial to make is faster to understand. Added prange example.

parent 084a25f5
# cython: infer_types=True
import numpy as np
cimport cython
ctypedef fused my_type:
int
double
long
cdef my_type clip(my_type a, my_type min_value, my_type max_value):
return min(max(a, min_value), max_value)
@cython.boundscheck(False)
@cython.wraparound(False)
def compute(my_type[:, ::1] array_1, my_type[:, ::1] array_2, my_type a, my_type b, my_type c):
x_max = array_1.shape[0]
y_max = array_1.shape[1]
assert tuple(array_1.shape) == tuple(array_2.shape)
if my_type == int:
dtype = np.intc
elif my_type == double:
dtype = np.double
else:
dtype = np.long
result = np.zeros((x_max, y_max), dtype=dtype)
cdef my_type[:, ::1] result_view = result
cdef my_type tmp
cdef Py_ssize_t x, y
for x in range(x_max):
for y in range(y_max):
tmp = clip(array_1[x, y], 2, 10)
tmp = tmp * a + array_2[x, y] * b
result_view[x, y] = tmp + c
return result
# cython: infer_types=True
import numpy as np
cimport cython
DTYPE = np.intc
cdef int clip(int a, int min_value, int max_value):
return min(max(a, min_value), max_value)
@cython.boundscheck(False)
@cython.wraparound(False)
def compute(int[:, ::1] array_1, int[:, ::1] array_2, int a, int b, int c):
x_max = array_1.shape[0]
y_max = array_1.shape[1]
assert tuple(array_1.shape) == tuple(array_2.shape)
result = np.zeros((x_max, y_max), dtype=DTYPE)
cdef int[:, ::1] result_view = result
cdef int tmp
cdef Py_ssize_t x, y
for x in range(x_max):
for y in range(y_max):
tmp = clip(array_1[x, y], 2, 10)
tmp = tmp * a + array_2[x, y] * b
result_view[x, y] = tmp + c
return result
import numpy as np
DTYPE = np.intc
cdef int clip(int a, int min_value, int max_value):
return min(max(a, min_value), max_value)
def compute(int[:, :] array_1, int[:, :] array_2, int a, int b, int c):
cdef Py_ssize_t x_max = array_1.shape[0]
cdef Py_ssize_t y_max = array_1.shape[1]
# array_1.shape is now a C array, no it's not possible
# to compare it simply by using == without a for-loop.
# To be able to compare it to array_2.shape easily,
# we convert them both to Python tuples.
assert tuple(array_1.shape) == tuple(array_2.shape)
result = np.zeros((x_max, y_max), dtype=DTYPE)
cdef int[:, :] result_view = result
cdef int tmp
cdef Py_ssize_t x, y
for x in range(x_max):
for y in range(y_max):
tmp = clip(array_1[x, y], 2, 10)
tmp = tmp * a + array_2[x, y] * b
result_view[x, y] = tmp + c
return result
import numpy as np
cimport cython
from cython.parallel import prange
ctypedef fused my_type:
int
double
long
# We declare our plain c function nogil
cdef my_type clip(my_type a, my_type min_value, my_type max_value) nogil:
return min(max(a, min_value), max_value)
@cython.boundscheck(False)
@cython.wraparound(False)
def compute(my_type[:, ::] array_1, my_type[:, ::1] array_2, my_type a, my_type b, my_type c):
cdef Py_ssize_t x_max = array_1.shape[0]
cdef Py_ssize_t y_max = array_1.shape[1]
assert tuple(array_1.shape) == tuple(array_2.shape)
if my_type == int:
dtype = np.intc
elif my_type == double:
dtype = np.double
else:
dtype = np.long
result = np.zeros((x_max, y_max), dtype=dtype)
cdef my_type[:, ::1] result_view = result
cdef my_type tmp
cdef Py_ssize_t x, y
# We use prange here.
for x in prange(x_max, nogil=True):
for y in range(y_max):
tmp = clip(array_1[x, y], 2, 10)
tmp = tmp * a + array_2[x, y] * b
result_view[x, y] = tmp + c
return result
import numpy as np
def clip(a, min_value, max_value):
return min(max(a, min_value), max_value)
def compute(array_1, array_2, a, b, c):
"""
This function must implement the formula
np.clip(array_1, 2, 10) * a + array_2 * b + c
array_1 and array_2 are 2D.
"""
x_max = array_1.shape[0]
y_max = array_1.shape[1]
assert array_1.shape == array_2.shape
result = np.zeros((x_max, y_max), dtype=array_1.dtype)
for x in range(x_max):
for y in range(y_max):
tmp = clip(array_1[x, y], 2, 10)
tmp = tmp * a + array_2[x, y] * b
result[x, y] = tmp + c
return result
...@@ -5,49 +5,46 @@ import numpy as np ...@@ -5,49 +5,46 @@ import numpy as np
# type info object. # type info object.
DTYPE = np.intc DTYPE = np.intc
def naive_convolve(f, g): # cdef means here that this function is a plain C function (so faster).
if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1: # To get all the benefits, we type the arguments and the return value.
raise ValueError("Only odd dimensions on filter supported") cdef int clip(int a, int min_value, int max_value):
assert f.dtype == DTYPE and g.dtype == DTYPE return min(max(a, min_value), max_value)
def compute(array_1, array_2, int a, int b, int c):
# The "cdef" keyword is also used within functions to type variables. It # The "cdef" keyword is also used within functions to type variables. It
# can only be used at the top indentation level (there are non-trivial # can only be used at the top indentation level (there are non-trivial
# problems with allowing them in other places, though we'd love to see # problems with allowing them in other places, though we'd love to see
# good and thought out proposals for it). # good and thought out proposals for it).
cdef Py_ssize_t x_max = array_1.shape[0]
cdef Py_ssize_t y_max = array_1.shape[1]
assert array_1.shape == array_2.shape
assert array_1.dtype == DTYPE
assert array_2.dtype == DTYPE
result = np.zeros((x_max, y_max), dtype=DTYPE)
# Py_ssize_t is the proper C type for Python array indices.
cdef Py_ssize_t x, y, s, t, v, w, s_from, s_to, t_from, t_to
cdef Py_ssize_t vmax = f.shape[0]
cdef Py_ssize_t wmax = f.shape[1]
cdef Py_ssize_t smax = g.shape[0]
cdef Py_ssize_t tmax = g.shape[1]
cdef Py_ssize_t smid = smax // 2
cdef Py_ssize_t tmid = tmax // 2
cdef Py_ssize_t xmax = vmax + 2*smid
cdef Py_ssize_t ymax = wmax + 2*tmid
h = np.zeros([xmax, ymax], dtype=DTYPE)
# It is very important to type ALL your variables. You do not get any # It is very important to type ALL your variables. You do not get any
# warnings if not, only much slower code (they are implicitly typed as # warnings if not, only much slower code (they are implicitly typed as
# Python objects). # Python objects).
# For the value variable, we want to use the same data type as is # For the "tmp" variable, we want to use the same data type as is
# stored in the array, so we use int because it correspond to np.intc. # stored in the array, so we use int because it correspond to np.intc.
# NB! An important side-effect of this is that if "value" overflows its # NB! An important side-effect of this is that if "tmp" overflows its
# datatype size, it will simply wrap around like in C, rather than raise # datatype size, it will simply wrap around like in C, rather than raise
# an error like in Python. # an error like in Python.
cdef int value
for x in range(xmax): cdef int tmp
for y in range(ymax):
# Cython has built-in C functions for min and max. # Py_ssize_t is the proper C type for Python array indices.
# This makes the following lines very fast. cdef Py_ssize_t x, y
s_from = max(smid - x, -smid)
s_to = min((xmax - x) - smid, smid + 1) for x in range(x_max):
t_from = max(tmid - y, -tmid) for y in range(y_max):
t_to = min((ymax - y) - tmid, tmid + 1)
value = 0 tmp = clip(array_1[x, y], 2, 10)
for s in range(s_from, s_to): tmp = tmp * a + array_2[x, y] * b
for t in range(t_from, t_to): result[x, y] = tmp + c
v = x - smid + s
w = y - tmid + t return result
value += g[smid - s, tmid - t] * f[v, w]
h[x, y] = value
return h
\ No newline at end of file
# cython: infer_types=True
import numpy as np
cimport cython
ctypedef fused my_type:
int
double
long
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef naive_convolve(my_type [:,:] f, my_type [:,:] g):
if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
raise ValueError("Only odd dimensions on filter supported")
vmax = f.shape[0]
wmax = f.shape[1]
smax = g.shape[0]
tmax = g.shape[1]
smid = smax // 2
tmid = tmax // 2
xmax = vmax + 2*smid
ymax = wmax + 2*tmid
if my_type is int:
dtype = np.intc
elif my_type is double:
dtype = np.double
else:
dtype = np.long
h_np = np.zeros([xmax, ymax], dtype=dtype)
cdef my_type [:,:] h = h_np
cdef my_type value
for x in range(xmax):
for y in range(ymax):
s_from = max(smid - x, -smid)
s_to = min((xmax - x) - smid, smid + 1)
t_from = max(tmid - y, -tmid)
t_to = min((ymax - y) - tmid, tmid + 1)
value = 0
for s in range(s_from, s_to):
for t in range(t_from, t_to):
v = x - smid + s
w = y - tmid + t
value += g[smid - s, tmid - t] * f[v, w]
h[x, y] = value
return h_np
\ No newline at end of file
# cython: infer_types=True
import numpy as np
cimport cython
DTYPE = np.intc
@cython.boundscheck(False)
@cython.wraparound(False)
def naive_convolve(int [:,::1] f, int [:,::1] g):
if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
raise ValueError("Only odd dimensions on filter supported")
vmax = f.shape[0]
wmax = f.shape[1]
smax = g.shape[0]
tmax = g.shape[1]
smid = smax // 2
tmid = tmax // 2
xmax = vmax + 2*smid
ymax = wmax + 2*tmid
h_np = np.zeros([xmax, ymax], dtype=DTYPE)
cdef int [:,::1] h = h_np
cdef int value
for x in range(xmax):
for y in range(ymax):
s_from = max(smid - x, -smid)
s_to = min((xmax - x) - smid, smid + 1)
t_from = max(tmid - y, -tmid)
t_to = min((ymax - y) - tmid, tmid + 1)
value = 0
for s in range(s_from, s_to):
for t in range(t_from, t_to):
v = x - smid + s
w = y - tmid + t
value += g[smid - s, tmid - t] * f[v, w]
h[x, y] = value
return h_np
\ No newline at end of file
import numpy as np
DTYPE = np.intc
# It is possible to declare types in the function declaration.
def naive_convolve(int [:,:] f, int [:,:] g):
if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
raise ValueError("Only odd dimensions on filter supported")
# We don't need to check for the type of NumPy array here because
# a check is already performed when calling the function.
cdef Py_ssize_t x, y, s, t, v, w, s_from, s_to, t_from, t_to
cdef Py_ssize_t vmax = f.shape[0]
cdef Py_ssize_t wmax = f.shape[1]
cdef Py_ssize_t smax = g.shape[0]
cdef Py_ssize_t tmax = g.shape[1]
cdef Py_ssize_t smid = smax // 2
cdef Py_ssize_t tmid = tmax // 2
cdef Py_ssize_t xmax = vmax + 2*smid
cdef Py_ssize_t ymax = wmax + 2*tmid
h_np = np.zeros([xmax, ymax], dtype=DTYPE)
cdef int [:,:] h = h_np
cdef int value
for x in range(xmax):
for y in range(ymax):
s_from = max(smid - x, -smid)
s_to = min((xmax - x) - smid, smid + 1)
t_from = max(tmid - y, -tmid)
t_to = min((ymax - y) - tmid, tmid + 1)
value = 0
for s in range(s_from, s_to):
for t in range(t_from, t_to):
v = x - smid + s
w = y - tmid + t
value += g[smid - s, tmid - t] * f[v, w]
h[x, y] = value
return h_np
\ No newline at end of file
from __future__ import division
import numpy as np
def naive_convolve(f, g):
# f is an image and is indexed by (v, w)
# g is a filter kernel and is indexed by (s, t),
# it needs odd dimensions
# h is the output image and is indexed by (x, y),
# it is not cropped
if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
raise ValueError("Only odd dimensions on filter supported")
# smid and tmid are number of pixels between the center pixel
# and the edge, ie for a 5x5 filter they will be 2.
#
# The output size is calculated by adding smid, tmid to each
# side of the dimensions of the input image.
vmax = f.shape[0]
wmax = f.shape[1]
smax = g.shape[0]
tmax = g.shape[1]
smid = smax // 2
tmid = tmax // 2
xmax = vmax + 2*smid
ymax = wmax + 2*tmid
# Allocate result image.
h = np.zeros([xmax, ymax], dtype=f.dtype)
# Do convolution
for x in range(xmax):
for y in range(ymax):
# Calculate pixel value for h at (x,y). Sum one component
# for each pixel (s, t) of the filter g.
s_from = max(smid - x, -smid)
s_to = min((xmax - x) - smid, smid + 1)
t_from = max(tmid - y, -tmid)
t_to = min((ymax - y) - tmid, tmid + 1)
value = 0
for s in range(s_from, s_to):
for t in range(t_from, t_to):
v = x - smid + s
w = y - tmid + t
value += g[smid - s, tmid - t] * f[v, w]
h[x, y] = value
return h
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -138,43 +138,52 @@ Python by using a normal ``import yourmod`` statement. ...@@ -138,43 +138,52 @@ Python by using a normal ``import yourmod`` statement.
The first Cython program The first Cython program
========================== ==========================
The code below does 2D discrete convolution of an image with a filter (and I'm You can easily execute the code of this tutorial by
sure you can do better!, let it serve for demonstration purposes). It is both downloading `the Jupyter notebook <https://github.com/cython/cython/blob/master/docs/examples/userguide/numpy_tutorial/numpy_cython.ipynb>`_.
valid Python and valid Cython code. I'll refer to it as both
:file:`convolve_py.py` for the Python version and :file:`convolve_cy.pyx` for the
Cython version -- Cython uses ".pyx" as its file suffix.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_py.py The code below does the equivalent of this function in numpy::
:linenos:
This should be compiled to produce :file:`convolve_cy.so` (for Linux systems). We def compute_np(array_1, array_2, a, b, c):
return np.clip(array_1, 2, 10) * a + array_2 * b + c
We'll say that ``array_1`` and ``array_2`` are 2D NumPy arrays of integer type and
``a``, ``b`` and ``c`` are three Python integers.
This function uses NumPy and is already really fast, so it might be a bit overkill
to do it again with Cython. This is for demonstration purposes. Nonetheless, we
will show that we achieve a better speed and memory efficiency than NumPy at the cost of more verbosity.
This code present the function with the loops over the two dimensions being unrolled.
It is both valid Python and valid Cython code. I'll refer to it as both
:file:`compute_py.py` for the Python version and :file:`compute_cy.pyx` for the
Cython version -- Cython uses ``.pyx`` as its file suffix.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_py.py
This should be compiled to produce :file:`compute_cy.so` (for Linux systems). We
run a Python session to test both the Python version (imported from run a Python session to test both the Python version (imported from
``.py``-file) and the compiled Cython module. ``.py``-file) and the compiled Cython module.
.. sourcecode:: ipython .. sourcecode:: ipython
In [1]: import numpy as np In [1]: import numpy as np
In [2]: import convolve_py In [2]: array_1 = np.random.uniform(0, 1000, size=(1000, 2000)).astype(np.intc)
In [3]: convolve_py.naive_convolve(np.array([[1, 1, 1]], dtype=np.int), In [3]: array_2 = np.random.uniform(0, 1000, size=(1000, 2000)).astype(np.intc)
... np.array([[1],[2],[1]], dtype=np.int)) In [4]: a = 4
Out [3]: In [5]: b = 3
array([[1, 1, 1], In [6]: c = 9
[2, 2, 2], In [7]: def compute_np(array_1, array_2, a, b, c):
[1, 1, 1]]) ...: return np.clip(array_1, 2, 10) * a + array_2 * b + c
In [4]: import convolve_cy In [8]: %timeit compute_np(array_1, array_2, a, b, c)
In [4]: convolve_cy.naive_convolve(np.array([[1, 1, 1]], dtype=np.int), 8.69 ms ± 297 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
... np.array([[1],[2],[1]], dtype=np.int))
Out [4]: In [9]: import compute_py
array([[1, 1, 1], In [10]: compute_py.compute(array_1, array_2, a, b, c)
[2, 2, 2], 25.6 s ± 225 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[1, 1, 1]])
In [11]: N = 600 In [11]: import compute_cy
In [12]: f = np.arange(N*N, dtype=np.int).reshape((N,N)) In [12]: compute_cy.compute(array_1, array_2, a, b, c)
In [13]: g = np.arange(81, dtype=np.int).reshape((9, 9)) 21.9 s ± 398 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [19]: %timeit convolve_py.naive_convolve(f, g)
16 s ± 70.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [20]: %timeit convolve_cy.naive_convolve(f, g)
13.5 s ± 99.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
There's not such a huge difference yet; because the C code still does exactly There's not such a huge difference yet; because the C code still does exactly
what the Python interpreter does (meaning, for instance, that a new object is what the Python interpreter does (meaning, for instance, that a new object is
...@@ -183,7 +192,7 @@ allocated for each number used). ...@@ -183,7 +192,7 @@ allocated for each number used).
You can look at the Python interaction and the generated C You can look at the Python interaction and the generated C
code by using ``-a`` when calling Cython from the command code by using ``-a`` when calling Cython from the command
line, ``%%cython -a`` when using a Jupyter Notebook, or by using line, ``%%cython -a`` when using a Jupyter Notebook, or by using
``cythonize('convolve_cy.pyx', annotate=True)`` when using a ``setup.py``. ``cythonize('compute_cy.pyx', annotate=True)`` when using a ``setup.py``.
Look at the generated html file and see what Look at the generated html file and see what
is needed for even the simplest statements. You get the point quickly. We need is needed for even the simplest statements. You get the point quickly. We need
to give Cython more information; we need to add types. to give Cython more information; we need to add types.
...@@ -192,44 +201,46 @@ Adding types ...@@ -192,44 +201,46 @@ Adding types
============= =============
To add types we use custom Cython syntax, so we are now breaking Python source To add types we use custom Cython syntax, so we are now breaking Python source
compatibility. Here's :file:`convolve_typed.pyx`. *Read the comments!* compatibility. Here's :file:`compute_typed.pyx`. *Read the comments!*
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_typed.pyx .. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_typed.pyx
:linenos:
.. figure:: convolve_types_html.png .. figure:: compute_typed_html.jpg
At this point, have a look at the generated C code for :file:`convolve_cy.pyx` and At this point, have a look at the generated C code for :file:`compute_cy.pyx` and
:file:`convolve_typed.pyx`. Click on the lines to expand them and see corresponding C. :file:`compute_typed.pyx`. Click on the lines to expand them and see corresponding C.
Especially have a look at the ``for-loops``: In :file:`convolve_cy.c`, these are ~20 lines Especially have a look at the ``for-loops``: In :file:`compute_cy.c`, these are ~20 lines
of C code to set up while in :file:`convolve_typed.c` a normal C for loop is used. of C code to set up while in :file:`compute_typed.c` a normal C for loop is used.
After building this and continuing my (very informal) benchmarks, I get: After building this and continuing my (very informal) benchmarks, I get:
.. sourcecode:: ipython .. sourcecode:: ipython
In [22]: %timeit convolve_typed.naive_convolve(f, g) In [13]: %timeit compute_typed.compute(array_1, array_2, a, b, c)
55.8 s ± 199 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) 10.5 s ± 301 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
So in the end, adding types make the Cython code slower? So adding types does make the code faster, but nowhere
near the speed of NumPy?
What happened is that most of the time spend in this code is spent on line What happened is that most of the time spend in this code is spent those lines,
54. :: and those lines are slower to execute than in pure Python::
value += g[smid - s, tmid - t] * f[v, w] tmp = clip(array_1[x, y], 2, 10)
tmp = tmp * a + array_2[x, y] * b
result[x, y] = tmp + c
So what made this line so much slower than in the pure Python version? So what made those line so much slower than in the pure Python version?
``g`` and ``f`` are still NumPy arrays, so Python objects, and expect ``array_1`` and ``array_2`` are still NumPy arrays, so Python objects, and expect
Python integers as indexes. Here we pass C int values. So every time Python integers as indexes. Here we pass C int values. So every time
Cython reaches this line, it has to convert all the C integers to Python Cython reaches this line, it has to convert all the C integers to Python
int objects. Since this line is called very often, it outweighs the speed int objects. Since this line is called very often, it outweighs the speed
benefits of the pure C loops that were created from the ``range()`` earlier. benefits of the pure C loops that were created from the ``range()`` earlier.
Furthermore, ``g[smid - s, tmid - t] * f[v, w]`` returns a Python integer Furthermore, ``tmp * a + array_2[x, y] * b`` returns a Python integer
and ``value`` is a C integer, so Cython has to do type conversions again. and ``tmp`` is a C integer, so Cython has to do type conversions again.
In the end those types conversions add up. And made our convolution really In the end those types conversions add up. And made our computation really
slow. But this problem can be solved easily by using memoryviews. slow. But this problem can be solved easily by using memoryviews.
Efficient indexing with memoryviews Efficient indexing with memoryviews
...@@ -262,25 +273,25 @@ Here is how to declare a memoryview of integers:: ...@@ -262,25 +273,25 @@ Here is how to declare a memoryview of integers::
No data is copied from the NumPy array to the memoryview in our example. No data is copied from the NumPy array to the memoryview in our example.
As the name implies, it is only a "view" of the memory. So we can use the As the name implies, it is only a "view" of the memory. So we can use the
view ``h`` for efficient indexing and at the end return the real NumPy view ``result_view`` for efficient indexing and at the end return the real NumPy
array ``h_np`` that holds the data that we operated on. array ``result`` that holds the data that we operated on.
Here is how to use them in our code: Here is how to use them in our code:
:file:`convolve_memview.pyx` :file:`compute_memview.pyx`
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_memview.pyx .. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_memview.pyx
:linenos:
Let's see how much faster accessing is now. Let's see how much faster accessing is now.
.. sourcecode:: ipython .. sourcecode:: ipython
In [22]: %timeit convolve_memview.naive_convolve(f, g) In [22]: %timeit compute_memview.compute(array_1, array_2, a, b, c)
57.1 ms ± 268 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) 9.56 ms ± 139 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Note the importance of this change. Note the importance of this change.
We're now 280 times faster than an interpreted version of Python. We're now 2700 times faster than an interpreted version of Python and close
to NumPy speed.
Memoryviews can be used with slices too, or even Memoryviews can be used with slices too, or even
with Python arrays. Check out the :ref:`memoryview page <memoryviews>` to with Python arrays. Check out the :ref:`memoryview page <memoryviews>` to
...@@ -296,13 +307,13 @@ The array lookups are still slowed down by two factors: ...@@ -296,13 +307,13 @@ The array lookups are still slowed down by two factors:
explicitly coded so that it doesn't use negative indices, and it explicitly coded so that it doesn't use negative indices, and it
(hopefully) always access within bounds. (hopefully) always access within bounds.
With decorators, we can deactivate those checks:: With decorators, we can deactivate those checks::
... ...
cimport cython cimport cython
@cython.boundscheck(False) # Deactivate bounds checking @cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False) # Deactivate negative indexing. @cython.wraparound(False) # Deactivate negative indexing.
def naive_convolve(int [:, :] f, int [:, :] g): def compute(int[:, :] array_1, int[:, :] array_2, int a, int b, int c):
... ...
Now bounds checking is not performed (and, as a side-effect, if you ''do'' Now bounds checking is not performed (and, as a side-effect, if you ''do''
...@@ -314,15 +325,18 @@ information. ...@@ -314,15 +325,18 @@ information.
.. sourcecode:: ipython .. sourcecode:: ipython
In [23]: %timeit convolve_index.naive_convolve(f, g) In [23]: %timeit compute_index.compute(array_1, array_2, a, b, c)
19.8 ms ± 299 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 6.1 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
We're now 800 times faster than the interpreted Python version. We're now faster than the NumPy version. NumPy is really well written,
but does not performs operation lazily, meaning a lot
of back and forth in memory. Our version is very memory efficient and
cache friendly because we know the operations in advance.
.. Warning:: .. Warning::
Speed comes with some cost. Especially it can be dangerous to set typed Speed comes with some cost. Especially it can be dangerous to set typed
objects (like ``f``, ``g`` and ``h`` in our sample code) to ``None``. objects (like ``array_1``, ``array_2`` and ``result_view`` in our sample code) to ``None``.
Setting such objects to ``None`` is entirely legal, but all you can do with them Setting such objects to ``None`` is entirely legal, but all you can do with them
is check whether they are None. All other use (attribute lookup or indexing) is check whether they are None. All other use (attribute lookup or indexing)
can potentially segfault or corrupt data (rather than raising exceptions as can potentially segfault or corrupt data (rather than raising exceptions as
...@@ -349,8 +363,9 @@ you have to declare the memoryview like this:: ...@@ -349,8 +363,9 @@ you have to declare the memoryview like this::
cdef int [::1, :, :] a cdef int [::1, :, :] a
If all this makes no sense to you, you can skip this part, the performance gains are If all this makes no sense to you, you can skip this part, declaring
not that important. If you still want to understand what contiguous arrays are arrays as contiguous constrain the usage of your function.
If you still want to understand what contiguous arrays are
all about, you can see `this answer on StackOverflow all about, you can see `this answer on StackOverflow
<https://stackoverflow.com/questions/26998223/what-is-the-difference-between-contiguous-and-non-contiguous-arrays>`_. <https://stackoverflow.com/questions/26998223/what-is-the-difference-between-contiguous-and-non-contiguous-arrays>`_.
...@@ -359,10 +374,10 @@ get by declaring the memoryviews as contiguous: ...@@ -359,10 +374,10 @@ get by declaring the memoryviews as contiguous:
.. sourcecode:: ipython .. sourcecode:: ipython
In [23]: %timeit convolve_contiguous.naive_convolve(f, g) In [23]: %timeit compute_contiguous.compute(array_1, array_2, a, b, c)
21.3 ms ± 489 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) 4.13 ms ± 87.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
We're still around 800 times faster than the interpreted Python version. We're now around two times faster than the NumPy version.
Making the function cleaner Making the function cleaner
=========================== ===========================
...@@ -376,22 +391,21 @@ Note that since type declarations must happen at the top indentation level, ...@@ -376,22 +391,21 @@ Note that since type declarations must happen at the top indentation level,
Cython won't infer the type of variables declared for the first time Cython won't infer the type of variables declared for the first time
in other indentation levels. It would change too much the meaning of in other indentation levels. It would change too much the meaning of
our code. This is why, we must still declare manually the type of the our code. This is why, we must still declare manually the type of the
``value`` variable. ``tmp``, ``x`` and ``y`` variable.
And actually, manually giving the type of the ``value`` variable will And actually, manually giving the type of the ``tmp`` variable will
be useful when using fused types. be useful when using fused types.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_infer_types.pyx .. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_infer_types.pyx
:linenos:
We now do a speed test: We now do a speed test:
.. sourcecode:: ipython .. sourcecode:: ipython
In [24]: %timeit convolve_infer_types.naive_convolve(f, g) In [24]: %timeit compute_infer_types.compute(array_1, array_2, a, b, c)
21.3 ms ± 344 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) 4.1 ms ± 54.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
We're still around 800 times faster than the interpreted Python version. Lo and behold, the speed has not changed.
More generic code More generic code
================== ==================
...@@ -416,31 +430,52 @@ know what NumPy data type we should use for our output array. ...@@ -416,31 +430,52 @@ know what NumPy data type we should use for our output array.
In this case, our function now works for ints, doubles and floats. In this case, our function now works for ints, doubles and floats.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_fused_types.pyx .. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_fused_types.pyx
:linenos:
We can check that the output type is the right one:: We can check that the output type is the right one::
>>>naive_convolve_fused_types(f, g).dtype >>>compute(array_1, array_2, a, b, c).dtype
dtype('int32') dtype('int32')
>>>naive_convolve_fused_types(f.astype(np.double), g.astype(np.double)).dtype >>>compute(array_1.astype(np.double), array_2.astype(np.double), a, b, c).dtype
dtype('float64') dtype('float64')
We now do a speed test: We now do a speed test:
.. sourcecode:: ipython .. sourcecode:: ipython
In [25]: %timeit convolve_fused_types.naive_convolve(f, g) In [25]: %timeit compute_fused_types.compute(array_1, array_2, a, b, c)
20 ms ± 392 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) 6 ms ± 70.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
We're a bit slower than before, because of the right call to the clip function
must be found at runtime and adds a bit of overhead.
Using multiple threads
======================
Cython have support for OpenMP. It have also some nice wrappers around it,
like the function :func:`prange`. You can see more information about Cython and
parralelism in :ref:`parallel`. Since we do elementwise operations, we can easily
distribute the work among multiple threads. It's important not to forget to pass the
correct arguments to the compiler to enable OpenMP. When using the Jupyter notebook,
you should use the cell magic like this::
%%cython --compile-args=-fopenmp --link-args=-fopenmp --force
The GIL must be released (see :ref:`Releasing the GIL <nogil>`), so this is why we
declare our :func:`clip` function ``nogil``.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_prange.pyx
We can have substantial speed gains for minimal effort:
.. sourcecode:: ipython
We're still around 800 times faster than the interpreted Python version. In [25]: %timeit compute_prange.compute(array_1, array_2, a, b, c)
3.41 ms ± 93.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Where to go from here? Where to go from here?
====================== ======================
* Since there is no Python interaction in the loops, it is possible with Cython
to release the GIL and use multiple cores easily. To learn how to do that,
you can see :ref:`using parallelism in Cython <parallel>`.
* If you want to learn how to make use of `BLAS <http://www.netlib.org/blas/>`_ * If you want to learn how to make use of `BLAS <http://www.netlib.org/blas/>`_
or `LAPACK <http://www.netlib.org/lapack/>`_ with Cython, you can watch or `LAPACK <http://www.netlib.org/lapack/>`_ with Cython, you can watch
`the presentation of Ian Henriksen at SciPy 2015 `the presentation of Ian Henriksen at SciPy 2015
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment