Commit 6d611970 authored by Alain Takoudjou

[feat] Try to download gitlab private raw files from gitlab API

If downloading the raw file fails, check whether the download can be done
through the API. This is possible if a username and password are provided in
the URL; the username should be PRIVATE-TOKEN (the username used for a private token).
parent fb45b3e5
...@@ -50,6 +50,7 @@ setup( ...@@ -50,6 +50,7 @@ setup(
'setuptools>=38.2.3', 'setuptools>=38.2.3',
'pip', 'pip',
'wheel', 'wheel',
'lxml',
], ],
include_package_data = True, include_package_data = True,
entry_points = entry_points, entry_points = entry_points,
......
...@@ -22,12 +22,13 @@ try: ...@@ -22,12 +22,13 @@ try:
# Python 3 # Python 3
from urllib.error import HTTPError from urllib.error import HTTPError
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse, quote, urlencode
except ImportError: except ImportError:
# Python 2 # Python 2
from urlparse import urlparse from urlparse import urlparse
from urlparse import urlunparse from urlparse import urlunparse
from urllib2 import HTTPError, Request, urlopen from urllib2 import HTTPError, Request, urlopen, quote
from urllib import urlencode
from zc.buildout.easy_install import realpath from zc.buildout.easy_install import realpath
from base64 import b64encode from base64 import b64encode
...@@ -43,7 +44,7 @@ import tempfile ...@@ -43,7 +44,7 @@ import tempfile
import zc.buildout import zc.buildout
from . import bytes2str, str2bytes from . import bytes2str, str2bytes
from .rmtree import rmtree from .rmtree import rmtree
from lxml.html import parse as lxmlparse
class netrc(netrc.netrc): class netrc(netrc.netrc):
...@@ -65,6 +66,9 @@ netrc = netrc() ...@@ -65,6 +66,9 @@ netrc = netrc()
class ChecksumError(zc.buildout.UserError):
    """Raised when a downloaded file fails its MD5 checksum verification."""
class GitlabAccessDeniedError(zc.buildout.UserError):
    """Raised when a GitLab raw URL answers with the "Sign in" page.

    GitLab replies with HTTP 200 and its sign-in page instead of an error
    status when access to a raw file is denied, so the condition is detected
    from the page content and reported through this exception.
    """
class Download(object): class Download(object):
"""Configurable download utility. """Configurable download utility.
...@@ -239,6 +243,13 @@ class Download(object): ...@@ -239,6 +243,13 @@ class Download(object):
self.logger.info('using alternate URL: %s', alternate_url) self.logger.info('using alternate URL: %s', alternate_url)
download_url = alternate_url download_url = alternate_url
self.urlretrieve(alternate_url, path) self.urlretrieve(alternate_url, path)
except GitlabAccessDeniedError:
header_dict, laburl = self._labraw_authproxy(url)
if len(header_dict.keys()) > 0:
# gitlab url, try from API
self.urlretrieve(laburl, path, headers=header_dict)
else:
raise
if not check_md5sum(path, md5sum): if not check_md5sum(path, md5sum):
raise ChecksumError('MD5 checksum mismatch downloading %r' raise ChecksumError('MD5 checksum mismatch downloading %r'
% download_url) % download_url)
...@@ -284,15 +295,59 @@ class Download(object): ...@@ -284,15 +295,59 @@ class Download(object):
if auth: if auth:
return '{0}:{2}'.format(*auth), url return '{0}:{2}'.format(*auth), url
def urlretrieve(self, url, tmp_path): def _labraw_authproxy(self, url): # -> url'
header_dict = {}
# url should be https://XXX.YYY/namespace/project/[-/]raw/....
if not re.match(r"https://[\w\-_\.\:\@\+]+/([\.\w\-\+_]+/[\.\w\-\+_]+/(-/){0,1}raw/)", url):
return header_dict, url
p = urlparse(url)
pathv = p.path.split('/')
if p.username == "PRIVATE-TOKEN" and p.password:
header_dict["PRIVATE-TOKEN"] = p.password
repo = '/'.join(pathv[1:3])
# FIXME this does not support refs like y/bstr.
# To support this we will need to do what
# https://lab.nexedi.com/nexedi/gitlab-workhorse/commit/5b8cf10e
# was doing - try to extract all variants for ref from longest to
# shortest and stop on the first variant thay yields good result.
if pathv[3] == '-': # the url is like .../-/raw/...
ref = pathv[5]
filepath = '/'.join(pathv[6:])
else:
ref = pathv[4]
filepath = '/'.join(pathv[5:])
qrepo = quote(repo, '')
qfilepath = quote(filepath, '')
path = '/api/v4/projects/%s/repository/files/%s/raw' % (qrepo, qfilepath)
query = urlencode({'ref': ref})
netloc = '%s:%s' % (p.hostname, p.port) if p.port else p.hostname
return header_dict, urlunparse((p.scheme, netloc, path, p.params, query, p.fragment))
def urlretrieve(self, url, tmp_path, headers=None):
    """Download ``url`` into the file ``tmp_path``.

    ``headers`` is an optional mapping of extra HTTP request headers.
    When ``self._auth(url)`` yields credentials, a Basic ``Authorization``
    header is added and the URL returned by ``_auth`` is requested instead.

    Returns ``(tmp_path, response_info)`` where ``response_info`` is the
    result of the response's ``info()`` method.

    Raises ``GitlabAccessDeniedError`` when ``url`` looks like a GitLab raw
    URL and the response is GitLab's "Sign in" page; nothing is written to
    ``tmp_path`` in that case.
    """
    from io import BytesIO

    auth = self._auth(url)
    if auth:
        req = Request(auth[1])
        req.add_header("Authorization",
                       "Basic " + bytes2str(b64encode(str2bytes(auth[0]))))
    else:
        req = Request(url)
    # ``headers`` defaults to None rather than a shared mutable dict.
    for k, v in (headers or {}).items():
        req.add_header(k, v)
    with closing(urlopen(req)) as src:
        # Is this a gitlab raw URL ?
        # Gitlab return to sign in page with code 200 if authentication failed.
        if re.match(r"https://[\w\-_\.\:\@\+]+/([\.\w\-\+_]+/[\.\w\-\+_]+/(-/){0,1}raw/)", url):
            # Parsing the response object directly would read it to EOF and
            # leave nothing to copy to tmp_path, so read the body once and
            # use the same bytes for both the check and the file.
            data = src.read()
            if data:  # an empty body cannot be the sign-in page
                parsed = lxmlparse(BytesIO(data))
                page_title = parsed.find(".//title")
                # An empty <title> element has text None.
                title_text = page_title.text if page_title is not None else None
                if title_text and title_text.startswith("Sign in"):
                    # the content is gitlab Sign in page
                    raise GitlabAccessDeniedError("You have been redirected to Sign in page")
            with open(tmp_path, 'wb') as dst:
                dst.write(data)
        else:
            with open(tmp_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        return tmp_path, src.info()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment