[feat] Try to download gitlab private raw files from gitlab API

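If downloading the raw file fails, check whether the download can be made through the GitLab API instead. This is possible if a username and password are provided in the URL; the username should be PRIVATE-TOKEN (this is the username used for a private token), and the password is then sent as the PRIVATE-TOKEN request header.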

[feat] Try to download gitlab private raw files from gitlab API
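If downloading the raw file fails, check whether the download can be made through the GitLab API instead. This is possible if a username and password are provided in the URL; the username should be PRIVATE-TOKEN (this is the username used for a private token), and the password is then sent as the PRIVATE-TOKEN request header.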
6d611970 · Alain Takoudjou · fb45b3e5 · 6d611970 · 6d611970
Commit 6d611970 authored Sep 04, 2024 by Alain Takoudjou
Hide whitespace changes
Inline Side-by-side

Showing with 61 additions and 5 deletions

setup.py setup.py +1 -0

src/zc/buildout/download.py src/zc/buildout/download.py +60 -5

No files found.
--- a/setup.py
+++ b/setup.py
@@ -50,6 +50,7 @@ setup(
        'setuptools>=38.2.3',
        'pip',
        'wheel',
+        'lxml',
    ],
    include_package_data = True,
    entry_points = entry_points,

--- a/src/zc/buildout/download.py
+++ b/src/zc/buildout/download.py
@@ -22,12 +22,13 @@ try:
    # Python 3
    from urllib.error import HTTPError
    from urllib.request import Request, urlopen
-    from urllib.parse import urlparse, urlunparse
+    from urllib.parse import urlparse, urlunparse, quote, urlencode
 except ImportError:
    # Python 2
    from urlparse import urlparse
    from urlparse import urlunparse
-    from urllib2 import HTTPError, Request, urlopen
+    from urllib2 import HTTPError, Request, urlopen, quote
+    from urllib import urlencode
 from zc.buildout.easy_install import realpath
 from base64 import b64encode
@@ -43,7 +44,7 @@ import tempfile
 import zc.buildout
 from . import bytes2str, str2bytes
 from .rmtree import rmtree
+from lxml.html import parse as lxmlparse
 class netrc(netrc.netrc):
@@ -65,6 +66,9 @@ netrc = netrc()
 class ChecksumError(zc.buildout.UserError):
    pass
+class GitlabAccessDeniedError(zc.buildout.UserError):
+    pass
 class Download(object):
    """Configurable download utility.
@@ -239,6 +243,13 @@ class Download(object):
                self.logger.info('using alternate URL: %s', alternate_url)
                download_url = alternate_url
                self.urlretrieve(alternate_url, path)
+            except GitlabAccessDeniedError:
+                header_dict, laburl  = self._labraw_authproxy(url)
+                if len(header_dict.keys()) > 0:
+                    # gitlab url, try from API
+                    self.urlretrieve(laburl, path, headers=header_dict)
+                else:
+                    raise
            if not check_md5sum(path, md5sum):
                raise ChecksumError('MD5 checksum mismatch downloading %r'
                                    % download_url)
@@ -284,15 +295,59 @@ class Download(object):
            if auth:
                return '{0}:{2}'.format(*auth), url
-    def urlretrieve(self, url, tmp_path):
+    def _labraw_authproxy(self, url): # -> url'
+        header_dict = {}
+        # url should be https://XXX.YYY/namespace/project/[-/]raw/....
+        if not re.match(r"https://[\w\-_\.\:\@\+]+/([\.\w\-\+_]+/[\.\w\-\+_]+/(-/){0,1}raw/)", url):
+            return header_dict, url
+        p = urlparse(url)
+        pathv = p.path.split('/')
+        if p.username == "PRIVATE-TOKEN" and p.password:
+            header_dict["PRIVATE-TOKEN"] = p.password
+        repo = '/'.join(pathv[1:3])
+        # FIXME this does not support refs like y/bstr.
+        # To support this we will need to do what
+        # https://lab.nexedi.com/nexedi/gitlab-workhorse/commit/5b8cf10e
+        # was doing - try to extract all variants for ref from longest to
+        # shortest and stop on the first variant that yields a good result.
+        if pathv[3] == '-': # the url is like .../-/raw/...
+          ref  = pathv[5]
+          filepath = '/'.join(pathv[6:])
+        else:
+          ref  = pathv[4]
+          filepath = '/'.join(pathv[5:])
+        qrepo     = quote(repo, '')
+        qfilepath = quote(filepath, '')
+        path  = '/api/v4/projects/%s/repository/files/%s/raw' % (qrepo, qfilepath)
+        query = urlencode({'ref': ref})
+        netloc = '%s:%s' % (p.hostname, p.port) if p.port else p.hostname
+        return header_dict, urlunparse((p.scheme, netloc, path, p.params, query, p.fragment))
+    def urlretrieve(self, url, tmp_path, headers={}):
        auth = self._auth(url)
        if auth:
            req = Request(auth[1])
            req.add_header("Authorization",
                           "Basic " + bytes2str(b64encode(str2bytes(auth[0]))))
        else:
-            req = url
+            req = Request(url)
+        for k, v in headers.items():
+            req.add_header(k, v)
        with closing(urlopen(req)) as src:
+            # Is this a gitlab raw URL ?
+            # GitLab returns the sign-in page with status code 200 if authentication failed.
+            if re.match(r"https://[\w\-_\.\:\@\+]+/([\.\w\-\+_]+/[\.\w\-\+_]+/(-/){0,1}raw/)", url):
+                parsed = lxmlparse(src)
+                page_title = parsed.find(".//title")
+                if page_title is not None and page_title.text.startswith("Sign in"):
+                    # the content is gitlab Sign in page
+                    raise GitlabAccessDeniedError("You have been redirected to Sign in page")
            with open(tmp_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)
            return tmp_path, src.info()