Commit 85e44d9a authored by Michael Tremer

importer: Change download behaviour

The downloader used to open a connection to the web server hosting our
content, decompress the data on the fly (if necessary), and parse it on
the fly as well, so that it could be fed straight into the database.

Some web servers do not seem to be patient enough to keep the connection
open when processing takes a little longer than usual. That caused the
import to fail.

This patch changes the behaviour so that we download all content first,
store it locally, and only then start processing it.
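As an illustration, here is a condensed sketch of that download-then-process
flow, based on the retrieve() helper added in the diff below; the URL and the
buffer sizes are only examples, not the importer's real sources:

    import gzip
    import tempfile
    import urllib.request

    def retrieve(url):
        # Buffer the complete download in a temporary file first, so slow
        # processing can no longer stall the HTTP connection
        t = tempfile.SpooledTemporaryFile(max_size=100 * 1024 * 1024)

        with urllib.request.urlopen(url) as res:
            while True:
                buf = res.read(65536)
                if not buf:
                    break
                t.write(buf)

        # Rewind so the caller can read from the beginning
        t.seek(0)

        # Transparently decompress gzipped payloads
        if res.headers.get("Content-Type") in ("application/x-gzip", "application/gzip"):
            t = gzip.GzipFile(fileobj=t, mode="rb")

        return t

    # Processing only starts once the download has completed
    # (placeholder URL for illustration only)
    for line in retrieve("https://example.com/data.txt.gz"):
        pass  # parse the line and feed it into the database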

Fixes: #12852
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
Cc: Peter Müller <peter.mueller@ipfire.org>
parent aa23e03d
@@ -19,6 +19,7 @@
 import gzip
 import logging
+import tempfile
 import urllib.request
 
 # Initialise logging
@@ -106,75 +107,76 @@ class Downloader(object):
         log.info("Using proxy %s" % url)
         self.proxy = url
 
-    def request(self, url, data=None, return_blocks=False):
+    def retrieve(self, url, data=None):
+        """
+            This method will fetch the content at the given URL
+            and will return a file-object to a temporary file.
+
+            If the content was compressed, it will be decompressed on the fly.
+        """
+        # Open a temporary file to buffer the downloaded content
+        t = tempfile.SpooledTemporaryFile(max_size=100 * 1024 * 1024)
+
         # Create a new request
         req = urllib.request.Request(url, data=data)
 
         # Configure proxy
         if self.proxy:
             req.set_proxy(self.proxy, "http")
 
-        return DownloaderContext(self, req, return_blocks=return_blocks)
-
-
-class DownloaderContext(object):
-    def __init__(self, downloader, request, return_blocks=False):
-        self.downloader = downloader
-        self.request = request
-
-        # Should we return one block or a single line?
-        self.return_blocks = return_blocks
-
-        # Save the response object
-        self.response = None
-
-    def __enter__(self):
-        log.info("Retrieving %s..." % self.request.full_url)
+        log.info("Retrieving %s..." % req.full_url)
 
         # Send request
-        self.response = urllib.request.urlopen(self.request)
+        res = urllib.request.urlopen(req)
 
         # Log the response headers
         log.debug("Response Headers:")
-        for header in self.headers:
-            log.debug("    %s: %s" % (header, self.get_header(header)))
+        for header in res.headers:
+            log.debug("    %s: %s" % (header, res.headers[header]))
 
-        return self
+        # Write the payload to the temporary file
+        with res as f:
+            while True:
+                buf = f.read(65536)
+                if not buf:
+                    break
 
-    def __exit__(self, type, value, traceback):
-        pass
+                t.write(buf)
 
-    def __iter__(self):
-        """
-            Makes the object iterable by going through each block
-        """
-        if self.return_blocks:
-            return iterate_over_blocks(self.body)
+        # Rewind the temporary file
+        t.seek(0)
 
-        return iterate_over_lines(self.body)
+        # Fetch the content type
+        content_type = res.headers.get("Content-Type")
 
-    @property
-    def headers(self):
-        if self.response:
-            return self.response.headers
+        # Decompress any gzipped response on the fly
+        if content_type in ("application/x-gzip", "application/gzip"):
+            t = gzip.GzipFile(fileobj=t, mode="rb")
 
-    def get_header(self, name):
-        if self.headers:
-            return self.headers.get(name)
+        # Return the temporary file handle
+        return t
 
-    @property
-    def body(self):
-        """
-            Returns a file-like object with the decoded content
-            of the response.
-        """
-        content_type = self.get_header("Content-Type")
+    def request_blocks(self, url, data=None):
+        """
+            This method will fetch the data from the URL and return an
+            iterator for each block in the data.
+        """
+        # Download the data first
+        t = self.retrieve(url, data=data)
 
-        # Decompress any gzipped response on the fly
-        if content_type in ("application/x-gzip", "application/gzip"):
-            return gzip.GzipFile(fileobj=self.response, mode="rb")
+        # Then, split it into blocks
+        return iterate_over_blocks(t)
 
-        # Return the response by default
-        return self.response
+    def request_lines(self, url, data=None):
+        """
+            This method will fetch the data from the URL and return an
+            iterator for each line in the data.
+        """
+        # Download the data first
+        t = self.retrieve(url, data=data)
+
+        # Then, split it into lines
+        return iterate_over_lines(t)
 
 
 def read_blocks(f):
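For comparison, a rough sketch of how the calling convention changes for the
import code; downloader, url and process() are placeholders standing in for
the real call sites updated further down:

    # Before: callers entered a context manager and iterated the response
    with downloader.request(url, return_blocks=True) as f:
        for block in f:
            process(block)

    # After: the Downloader buffers the download first and returns an iterator
    for block in downloader.request_blocks(url):
        process(block)

    # Line-oriented sources use the matching helper
    for line in downloader.request_lines(url):
        process(line)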
@@ -435,9 +435,8 @@ class CLI(object):
         for source_key in location.importer.WHOIS_SOURCES:
             for single_url in location.importer.WHOIS_SOURCES[source_key]:
-                with downloader.request(single_url, return_blocks=True) as f:
-                    for block in f:
-                        self._parse_block(block, source_key, validcountries)
+                for block in downloader.request_blocks(single_url):
+                    self._parse_block(block, source_key, validcountries)
 
         # Process all parsed networks from every RIR we happen to have access to,
         # insert the largest network chunks into the networks table immediately...
@@ -518,9 +517,8 @@ class CLI(object):
             for single_url in location.importer.EXTENDED_SOURCES[source_key]:
                 with self.db.transaction():
                     # Download data
-                    with downloader.request(single_url) as f:
-                        for line in f:
-                            self._parse_line(line, source_key, validcountries)
+                    for line in downloader.request_lines(single_url):
+                        self._parse_line(line, source_key, validcountries)
 
         # Download and import (technical) AS names from ARIN
         self._import_as_names_from_arin()
...@@ -871,50 +869,46 @@ class CLI(object): ...@@ -871,50 +869,46 @@ class CLI(object):
# technical, not intended for human consumption, as description fields in # technical, not intended for human consumption, as description fields in
# organisation handles for other RIRs are - however, this is what we have got, # organisation handles for other RIRs are - however, this is what we have got,
# and in some cases, it might be still better than nothing) # and in some cases, it might be still better than nothing)
with downloader.request("https://ftp.arin.net/info/asn.txt", return_blocks=False) as f: for line in downloader.request_lines("https://ftp.arin.net/info/asn.txt"):
for line in f: # Valid lines start with a space, followed by the number of the Autonomous System ...
# Convert binary line to string... if not line.startswith(" "):
line = str(line) continue
# ... valid lines start with a space, followed by the number of the Autonomous System ...
if not line.startswith(" "):
continue
# Split line and check if there is a valid ASN in it... # Split line and check if there is a valid ASN in it...
asn, name = line.split()[0:2] asn, name = line.split()[0:2]
try: try:
asn = int(asn) asn = int(asn)
except ValueError: except ValueError:
log.debug("Skipping ARIN AS names line not containing an integer for ASN") log.debug("Skipping ARIN AS names line not containing an integer for ASN")
continue continue
# Filter invalid ASNs... # Filter invalid ASNs...
if not self._check_parsed_asn(asn): if not self._check_parsed_asn(asn):
continue continue
# Skip any AS name that appears to be a placeholder for a different RIR or entity... # Skip any AS name that appears to be a placeholder for a different RIR or entity...
if re.match(r"^(ASN-BLK|)(AFCONC|AFRINIC|APNIC|ASNBLK|LACNIC|RIPE|IANA)(?:\d?$|\-)", name): if re.match(r"^(ASN-BLK|)(AFCONC|AFRINIC|APNIC|ASNBLK|LACNIC|RIPE|IANA)(?:\d?$|\-)", name):
continue continue
# Bail out in case the AS name contains anything we do not expect here... # Bail out in case the AS name contains anything we do not expect here...
if re.search(r"[^a-zA-Z0-9-_]", name): if re.search(r"[^a-zA-Z0-9-_]", name):
log.debug("Skipping ARIN AS name for %s containing invalid characters: %s" % \ log.debug("Skipping ARIN AS name for %s containing invalid characters: %s" % \
(asn, name)) (asn, name))
# Things look good here, run INSERT statement and skip this one if we already have # Things look good here, run INSERT statement and skip this one if we already have
# a (better?) name for this Autonomous System... # a (better?) name for this Autonomous System...
self.db.execute(""" self.db.execute("""
INSERT INTO autnums( INSERT INTO autnums(
number, number,
name,
source
) VALUES (%s, %s, %s)
ON CONFLICT (number) DO NOTHING""",
asn,
name, name,
"ARIN", source
) ) VALUES (%s, %s, %s)
ON CONFLICT (number) DO NOTHING""",
asn,
name,
"ARIN",
)
def handle_update_announcements(self, ns): def handle_update_announcements(self, ns):
server = ns.server[0] server = ns.server[0]
@@ -1262,8 +1256,11 @@ class CLI(object):
         downloader = location.importer.Downloader()
 
         try:
-            with downloader.request("https://ip-ranges.amazonaws.com/ip-ranges.json", return_blocks=False) as f:
-                aws_ip_dump = json.load(f.body)
+            # Fetch IP ranges
+            f = downloader.retrieve("https://ip-ranges.amazonaws.com/ip-ranges.json")
+
+            # Parse downloaded file
+            aws_ip_dump = json.load(f)
         except Exception as e:
             log.error("unable to preprocess Amazon AWS IP ranges: %s" % e)
             return
@@ -1386,12 +1383,11 @@ class CLI(object):
         ]
 
         for url in ip_urls:
-            try:
-                with downloader.request(url, return_blocks=False) as f:
-                    fcontent = f.body.readlines()
-            except Exception as e:
-                log.error("Unable to download Spamhaus DROP URL %s: %s" % (url, e))
-                return
+            # Fetch IP list
+            f = downloader.retrieve(url)
+
+            # Split into lines
+            fcontent = f.readlines()
 
             # Conduct a very basic sanity check to rule out CDN issues causing bogus DROP
             # downloads.
@@ -1408,7 +1404,6 @@ class CLI(object):
             # the override table in case they are valid...
             with self.db.transaction():
                 for sline in fcontent:
-
                     # The response is assumed to be encoded in UTF-8...
                     sline = sline.decode("utf-8")
@@ -1443,18 +1438,13 @@ class CLI(object):
         )
 
         for url in asn_urls:
-            try:
-                with downloader.request(url, return_blocks=False) as f:
-                    fcontent = f.body.readlines()
-            except Exception as e:
-                log.error("Unable to download Spamhaus DROP URL %s: %s" % (url, e))
-                return
+            # Fetch URL
+            f = downloader.retrieve(url)
 
             # Iterate through every line, filter comments and add remaining ASNs to
             # the override table in case they are valid...
             with self.db.transaction():
-                for sline in fcontent:
+                for sline in t.readlines():
                     # The response is assumed to be encoded in UTF-8...
                     sline = sline.decode("utf-8")