[fix] Rewrite 'urlretrieve' helper to fix various download-related issues

- Py3: stop using legacy API of urllib.request and fix download of http(s) URLs containing user:passwd@ - Py2: avoid OOM when downloading huge files This is implemented as a method in case we want to make it configurable via [buildout].

[fix] Rewrite 'urlretrieve' helper to fix various download-related issues
- Py3: stop using legacy API of urllib.request and fix download of http(s) URLs containing user:passwd@ - Py2: avoid OOM when downloading huge files This is implemented as a method in case we want to make it configurable via [buildout].
10a5bc94 · Julien Muchembled · Xavier Thompson · 914bb340 · 10a5bc94 · 10a5bc94
Commit 10a5bc94 authored Dec 29, 2021 by Julien Muchembled Committed by Xavier Thompson Mar 13, 2024
4 changed files
--- a/src/zc/buildout/__init__.py
+++ b/src/zc/buildout/__init__.py
@@ -49,3 +49,16 @@ class UserError(Exception):

    def __str__(self):
        return " ".join(map(str, self.args))
+
+# Used for Python 2-3 compatibility
+if str is bytes:
+    bytes2str = str2bytes = lambda s: s
+    def unicode2str(s):
+        return s.encode('utf-8')
+else:
+    def bytes2str(s):
+        return s.decode()
+    def str2bytes(s):
+        return s.encode()
+    def unicode2str(s):
+        return s
--- a/src/zc/buildout/download.py
+++ b/src/zc/buildout/download.py
@@ -20,35 +20,17 @@ except ImportError:

 try:
    # Python 3
-    from urllib.request import urlretrieve
-    from urllib.parse import urlparse
+    from urllib.request import Request, urlopen
+    from urllib.parse import urlparse, urlunparse
 except ImportError:
    # Python 2
-    import base64
    from urlparse import urlparse
    from urlparse import urlunparse
-    import urllib2
-
-    def urlretrieve(url, tmp_path):
-        """Work around Python issue 24599 including basic auth support
-        """
-        scheme, netloc, path, params, query, frag = urlparse(url)
-        auth, host = urllib2.splituser(netloc)
-        if auth:
-            url = urlunparse((scheme, host, path, params, query, frag))
-            req = urllib2.Request(url)
-            base64string = base64.encodestring(auth)[:-1]
-            basic = "Basic " + base64string
-            req.add_header("Authorization", basic)
-        else:
-            req = urllib2.Request(url)
-        url_obj = urllib2.urlopen(req)
-        with open(tmp_path, 'wb') as fp:
-            fp.write(url_obj.read())
-        return tmp_path, url_obj.info()
-
+    from urllib2 import Request, urlopen

 from zc.buildout.easy_install import realpath
+from base64 import b64encode
+from contextlib import closing
 import logging
 import os
 import os.path
@@ -56,6 +38,7 @@ import re
 import shutil
 import tempfile
 import zc.buildout
+from . import bytes2str, str2bytes
 from .rmtree import rmtree


@@ -225,13 +208,14 @@ class Download(object):
                nc.get('signature-certificate-list'), md5sum):
                # Download from original url if not cached or md5sum doesn't match.
                try:
-                    tmp_path, headers = urlretrieve(url, tmp_path)
+                    tmp_path, headers = self.urlretrieve(url, tmp_path)
                except HTTPError:
                    if not alternate_url:
                        raise
                    self.logger.info('using alternate URL: %s', alternate_url)
                    download_url = alternate_url
-                    tmp_path, headers = urlretrieve(alternate_url, tmp_path)
+                    tmp_path, headers = self.urlretrieve(
+                        alternate_url, tmp_path)
                if not check_md5sum(tmp_path, md5sum):
                    raise ChecksumError(
                        'MD5 checksum mismatch downloading %r' % download_url)
@@ -284,6 +268,22 @@ class Download(object):
            url_host, url_port = parsed[-2:]
            return '%s:%s' % (url_host, url_port)

+    def urlretrieve(self, url, tmp_path):
+        parsed_url = urlparse(url)
+        req = url
+        if parsed_url.scheme in ('http', 'https'):
+            auth_host = parsed_url.netloc.rsplit('@', 1)
+            if len(auth_host) > 1:
+                auth = auth_host[0]
+                url = parsed_url._replace(netloc=auth_host[1]).geturl()
+                req = Request(url)
+                req.add_header("Authorization",
+                               "Basic " + bytes2str(b64encode(str2bytes(auth))))
+        with closing(urlopen(req)) as src:
+            with open(tmp_path, 'wb') as dst:
+                shutil.copyfileobj(src, dst)
+            return tmp_path, src.info()
+

 def check_md5sum(path, md5sum):
    """Tell whether the MD5 checksum of the file at path matches.

--- a/src/zc/buildout/testing.py
+++ b/src/zc/buildout/testing.py
@@ -23,6 +23,7 @@ except ImportError:
    from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
    from urllib2        import urlopen

+import base64
 import errno
 import logging
 import multiprocessing
@@ -415,6 +416,23 @@ class Handler(BaseHTTPRequestHandler):
            self.__server.__log = False
            return k()

+        if self.path.startswith('/private/'):
+            auth = self.headers.get('Authorization')
+            if auth and auth.startswith('Basic ') and \
+                self.path[9:].encode() == base64.b64decode(
+                    self.headers.get('Authorization')[6:]):
+                return k()
+            # But not returning 401+WWW-Authenticate, we check that the client
+            # skips auth challenge, which is not free (in terms of performance)
+            # and useless for what we support.
+            self.send_response(403, 'Forbidden')
+            out = '<html><body>Forbidden</body></html>'.encode()
+            self.send_header('Content-Length', str(len(out)))
+            self.send_header('Content-Type', 'text/html')
+            self.end_headers()
+            self.wfile.write(out)
+            return
+
        path = os.path.abspath(os.path.join(self.tree, *self.path.split('/')))
        if not (
            ((path == self.tree) or path.startswith(self.tree+os.path.sep))

--- a/src/zc/buildout/tests/download.txt
+++ b/src/zc/buildout/tests/download.txt
@@ -152,6 +152,19 @@ This is a foo text.

 >>> remove(path)

+HTTP basic authentication:
+
+>>> download = Download()
+>>> user_url = server_url.replace('/localhost:', '/%s@localhost:') + 'private/'
+>>> path, is_temp = download(user_url % 'foo:' + 'foo:')
+>>> is_temp; remove(path)
+True
+>>> path, is_temp = download(user_url % 'foo:bar' + 'foo:bar')
+>>> is_temp; remove(path)
+True
+>>> download(user_url % 'bar:' + 'foo:')
+Traceback (most recent call last):
+UserError: Error downloading ...: HTTP Error 403: Forbidden

 Downloading using the download cache
 ------------------------------------