Commit 6a9722bb authored by Jérome Perrin's avatar Jérome Perrin

download: support more hash algorithms, not just md5sum

TODO: see XXX comments in the doctest

This changes the code to use hashlib directly; the module was introduced in
Python 2.5 and we don't need to support older versions.
parent 50e0e3b7
...@@ -66,6 +66,7 @@ except ImportError: ...@@ -66,6 +66,7 @@ except ImportError:
from zc.buildout.easy_install import realpath from zc.buildout.easy_install import realpath
import hashlib
import logging import logging
import os import os
import os.path import os.path
...@@ -122,7 +123,7 @@ class Download(object): ...@@ -122,7 +123,7 @@ class Download(object):
if self.download_cache is not None: if self.download_cache is not None:
return os.path.join(self.download_cache, self.namespace or '') return os.path.join(self.download_cache, self.namespace or '')
def __call__(self, url, md5sum=None, path=None): def __call__(self, url, md5sum=None, path=None, hashes=None):
"""Download a file according to the utility's configuration. """Download a file according to the utility's configuration.
url: URL to download url: URL to download
...@@ -133,13 +134,13 @@ class Download(object): ...@@ -133,13 +134,13 @@ class Download(object):
""" """
if self.cache: if self.cache:
local_path, is_temp = self.download_cached(url, md5sum) local_path, is_temp = self.download_cached(url, md5sum, hashes)
else: else:
local_path, is_temp = self.download(url, md5sum, path) local_path, is_temp = self.download(url, md5sum, path, hashes)
return locate_at(local_path, path), is_temp return locate_at(local_path, path), is_temp
def download_cached(self, url, md5sum=None): def download_cached(self, url, md5sum=None, hashes=None):
"""Download a file from a URL using the cache. """Download a file from a URL using the cache.
This method assumes that the cache has been configured. Optionally, it This method assumes that the cache has been configured. Optionally, it
...@@ -164,7 +165,7 @@ class Download(object): ...@@ -164,7 +165,7 @@ class Download(object):
is_temp = False is_temp = False
if self.fallback: if self.fallback:
try: try:
_, is_temp = self.download(url, md5sum, cached_path) _, is_temp = self.download(url, md5sum, cached_path, hashes)
except ChecksumError: except ChecksumError:
raise raise
except Exception: except Exception:
...@@ -174,15 +175,19 @@ class Download(object): ...@@ -174,15 +175,19 @@ class Download(object):
raise ChecksumError( raise ChecksumError(
'MD5 checksum mismatch for cached download ' 'MD5 checksum mismatch for cached download '
'from %r at %r' % (url, cached_path)) 'from %r at %r' % (url, cached_path))
if not check_integrity(cached_path, hashes):
raise ChecksumError(
'Checksum mismatch for cached download '
'from %r at %r' % (url, cached_path))
self.logger.debug('Using cache file %s' % cached_path) self.logger.debug('Using cache file %s' % cached_path)
else: else:
self.logger.debug('Cache miss; will cache %s as %s' % self.logger.debug('Cache miss; will cache %s as %s' %
(url, cached_path)) (url, cached_path))
_, is_temp = self.download(url, md5sum, cached_path) _, is_temp = self.download(url, md5sum, cached_path, hashes)
return cached_path, is_temp return cached_path, is_temp
def download(self, url, md5sum=None, path=None): def download(self, url, md5sum=None, path=None, hashes=None):
"""Download a file from a URL to a given or temporary path. """Download a file from a URL to a given or temporary path.
An online resource is always downloaded to a temporary file and moved An online resource is always downloaded to a temporary file and moved
...@@ -204,6 +209,10 @@ class Download(object): ...@@ -204,6 +209,10 @@ class Download(object):
raise ChecksumError( raise ChecksumError(
'MD5 checksum mismatch for local resource at %r.' % 'MD5 checksum mismatch for local resource at %r.' %
url_path) url_path)
if not check_integrity(url_path, hashes):
raise ChecksumError(
'Checksum mismatch for local resource at %r.' %
url_path)
return locate_at(url_path, path), False return locate_at(url_path, path), False
if self.offline: if self.offline:
...@@ -225,6 +234,10 @@ class Download(object): ...@@ -225,6 +234,10 @@ class Download(object):
if not check_md5sum(tmp_path, md5sum): if not check_md5sum(tmp_path, md5sum):
raise ChecksumError( raise ChecksumError(
'MD5 checksum mismatch downloading %r' % url) 'MD5 checksum mismatch downloading %r' % url)
if not check_integrity(tmp_path, hashes):
raise ChecksumError(
'Checksum mismatch downloading %r' % url)
# Upload the file to network cache. # Upload the file to network cache.
if nc.get('upload-cache-url') and nc.get('upload-dir-url'): if nc.get('upload-cache-url') and nc.get('upload-dir-url'):
upload_network_cached( upload_network_cached(
...@@ -280,25 +293,46 @@ class Download(object): ...@@ -280,25 +293,46 @@ class Download(object):
return '%s:%s' % (url_host, url_port) return '%s:%s' % (url_host, url_port)
def check_md5sum(path, md5sum): def check_integrity(path, hashes):
"""Tell whether the MD5 checksum of the file at path matches. """Tell wether the checksum of the file at path matches any of the hashes.
No checksum being given is considered a match. The hashes is string following format `algorithm:hash`, or None.
Multiple hashes can be specified, by separating them by spaces. In that
case, having one hash matching is enough.
hashes being None is considered a match.
""" """
if md5sum is None: if hashes is None:
return True return True
for algorithm_and_expected_hash in hashes.split():
algorithm, expected_hash = algorithm_and_expected_hash.split(':', 1)
f = open(path, 'rb') f = open(path, 'rb')
checksum = md5() checksum = hashlib.new(algorithm)
try: try:
chunk = f.read(2**16) chunk = f.read(2**16)
while chunk: while chunk:
checksum.update(chunk) checksum.update(chunk)
chunk = f.read(2**16) chunk = f.read(2**16)
return checksum.hexdigest() == md5sum if checksum.hexdigest() == expected_hash:
return True
finally: finally:
f.close() f.close()
return False
def check_md5sum(path, md5sum):
    """Report whether the file at ``path`` has the given MD5 checksum.

    Kept for callers that still pass a bare MD5 hex digest.  When ``md5sum``
    is None no verification was requested, which counts as a match.
    Otherwise the digest is turned into an ``md5:``-prefixed hash
    specification and the comparison is delegated to ``check_integrity``.
    """
    if md5sum is not None:
        legacy_spec = 'md5:' + md5sum
        return check_integrity(path, legacy_spec)
    return True
def remove(path): def remove(path):
......
...@@ -69,19 +69,56 @@ the local file itself: ...@@ -69,19 +69,56 @@ the local file itself:
>>> download(join(server_data, 'foo.txt')) >>> download(join(server_data, 'foo.txt'))
('/sample_files/foo.txt', False) ('/sample_files/foo.txt', False)
We can also have the downloaded file's MD5 sum checked: We can also have the downloaded file's integrity checked:
>>> import hashlib
>>> path, is_temp = download(server_url+'foo.txt',
... hashes='sha256:%s' % hashlib.sha256('This is a foo text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
>>> download(server_url+'foo.txt',
... hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch downloading 'http://localhost/foo.txt'
We can specify multiple hashes; as long as one of them matches, the download is accepted.
XXX not sure it makes sense to expose this here ...
XXX but then hashe*s* is a bad name - maybe integrity ?
>>> path, is_temp = download(
... server_url+'foo.txt',
... hashes='sha512:1234567-wrong-hash sha256:%s' % hashlib.sha256('This is a foo text.'.encode()).hexdigest())
>>> is_temp
True
>>> remove(path)
The error message in the event of a checksum mismatch for a local file
reads somewhat differently:
>>> download(join(server_data, 'foo.txt'),
... hashes='sha256:%s' % hashlib.sha256('This is a foo text.'.encode()).hexdigest())
('/sample_files/foo.txt', False)
>>> download(join(server_data, 'foo.txt'),
... hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch for local resource at '/sample_files/foo.txt'.
>>> try: from hashlib import md5
... except ImportError: from md5 import new as md5 This also supports MD5 sums for legacy use:
>>> path, is_temp = download(server_url+'foo.txt', >>> path, is_temp = download(server_url+'foo.txt',
... md5('This is a foo text.'.encode()).hexdigest()) ... hashlib.md5('This is a foo text.'.encode()).hexdigest())
>>> is_temp >>> is_temp
True True
>>> remove(path) >>> remove(path)
>>> download(server_url+'foo.txt', >>> download(server_url+'foo.txt',
... md5('The wrong text.'.encode()).hexdigest()) ... hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last): Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt' ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
...@@ -89,11 +126,11 @@ The error message in the event of an MD5 checksum mismatch for a local file ...@@ -89,11 +126,11 @@ The error message in the event of an MD5 checksum mismatch for a local file
reads somewhat differently: reads somewhat differently:
>>> download(join(server_data, 'foo.txt'), >>> download(join(server_data, 'foo.txt'),
... md5('This is a foo text.'.encode()).hexdigest()) ... hashlib.md5('This is a foo text.'.encode()).hexdigest())
('/sample_files/foo.txt', False) ('/sample_files/foo.txt', False)
>>> download(join(server_data, 'foo.txt'), >>> download(join(server_data, 'foo.txt'),
... md5('The wrong text.'.encode()).hexdigest()) ... hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last): Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch for local resource at '/sample_files/foo.txt'. ChecksumError: MD5 checksum mismatch for local resource at '/sample_files/foo.txt'.
...@@ -165,10 +202,17 @@ the file on the server to see this: ...@@ -165,10 +202,17 @@ the file on the server to see this:
>>> cat(path) >>> cat(path)
This is a foo text. This is a foo text.
If we specify an MD5 checksum for a file that is already in the cache, the If we specify hashes for a file that is already in the cache, the
cached copy's checksum will be verified: cached copy's checksum will be verified:
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest()) >>> download(server_url+'foo.txt', hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch for cached download
from 'http://localhost/foo.txt' at '/download-cache/foo.txt'
Same for legacy MD5 checksums:
>>> download(server_url+'foo.txt', hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last): Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch for cached download ChecksumError: MD5 checksum mismatch for cached download
from 'http://localhost/foo.txt' at '/download-cache/foo.txt' from 'http://localhost/foo.txt' at '/download-cache/foo.txt'
...@@ -247,7 +291,14 @@ This is a foo text. ...@@ -247,7 +291,14 @@ This is a foo text.
However, resources with checksum mismatches will not be copied to the cache: However, resources with checksum mismatches will not be copied to the cache:
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest()) >>> download(server_url+'foo.txt', hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: Checksum mismatch downloading 'http://localhost/foo.txt'
>>> ls(cache)
Same for legacy MD5 checksums:
>>> download(server_url+'foo.txt', hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last): Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt' ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
>>> ls(cache) >>> ls(cache)
...@@ -347,7 +398,7 @@ the test is run, so we don't actually know the full URL of the file. Let's ...@@ -347,7 +398,7 @@ the test is run, so we don't actually know the full URL of the file. Let's
check that the checksum actually belongs to the particular URL used: check that the checksum actually belongs to the particular URL used:
>>> (path.lower() == >>> (path.lower() ==
... join(cache, md5((server_url+'foo.txt').encode()).hexdigest()).lower()) ... join(cache, hashlib.md5((server_url+'foo.txt').encode()).hexdigest()).lower())
True True
The cached copy is used when downloading the file again: The cached copy is used when downloading the file again:
...@@ -370,7 +421,7 @@ cache under a different name: ...@@ -370,7 +421,7 @@ cache under a different name:
>>> path == path2 >>> path == path2
False False
>>> (path2.lower() == >>> (path2.lower() ==
... join(cache, md5((server_url+'other/foo.txt').encode()).hexdigest() ... join(cache, hashlib.md5((server_url+'other/foo.txt').encode()).hexdigest()
... ).lower()) ... ).lower())
True True
>>> cat(path) >>> cat(path)
...@@ -451,12 +502,19 @@ When trying to download a resource whose checksum does not match, the cached ...@@ -451,12 +502,19 @@ When trying to download a resource whose checksum does not match, the cached
copy will neither be used nor overwritten: copy will neither be used nor overwritten:
>>> write(server_data, 'foo.txt', 'This is a foo text.') >>> write(server_data, 'foo.txt', 'This is a foo text.')
>>> download(server_url+'foo.txt', md5('The wrong text.'.encode()).hexdigest()) >>> download(server_url+'foo.txt', hashes='sha256:%s' % hashlib.sha256('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last): Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt' ChecksumError: Checksum mismatch downloading 'http://localhost/foo.txt'
>>> cat(cache, 'foo.txt') >>> cat(cache, 'foo.txt')
The wrong text. The wrong text.
This is also the case with legacy MD5 checksums:
>>> download(server_url+'foo.txt', hashlib.md5('The wrong text.'.encode()).hexdigest())
Traceback (most recent call last):
ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
>>> cat(cache, 'foo.txt')
The wrong text.
Configuring the download utility from buildout options Configuring the download utility from buildout options
------------------------------------------------------ ------------------------------------------------------
...@@ -538,7 +596,7 @@ True ...@@ -538,7 +596,7 @@ True
Regressions Regressions
----------- -----------
MD5 checksum calculation needs to be reliable on all supported systems, which Checksum calculation needs to be reliable on all supported systems, which
requires text files to be treated as binary to avoid implicit line-ending requires text files to be treated as binary to avoid implicit line-ending
conversions: conversions:
...@@ -547,7 +605,10 @@ conversions: ...@@ -547,7 +605,10 @@ conversions:
>>> _ = f.write(text.encode()) >>> _ = f.write(text.encode())
>>> f.close() >>> f.close()
>>> path, is_temp = Download()(server_url+'foo.txt', >>> path, is_temp = Download()(server_url+'foo.txt',
... md5(text.encode()).hexdigest()) ... hashlib.md5(text.encode()).hexdigest())
>>> remove(path)
>>> path, is_temp = Download()(server_url+'foo.txt',
... hashes='sha512:%s' % hashlib.sha512(text.encode()).hexdigest())
>>> remove(path) >>> remove(path)
When "downloading" a directory given by file-system path or ``file:`` URL and When "downloading" a directory given by file-system path or ``file:`` URL and
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment