Commit 85e44d9a authored by Michael Tremer

importer: Change download behaviour

The downloader used to open a connection to the web server hosting our
content, decompress the data on the fly (if necessary), and parse it on
the fly as well, so that it could be fed straight into the database.

Some web servers do not seem to be patient enough to keep the connection
open when processing takes a little longer than usual. That caused the
import to fail.

This patch changes the behaviour so that we download all content first,
store it locally, and only then start processing it.
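As an illustration, here is a condensed sketch of that download-then-process
flow, based on the retrieve() helper added in the diff below; the URL and the
buffer sizes are only examples, not the importer's real sources:

    import gzip
    import tempfile
    import urllib.request

    def retrieve(url):
        # Buffer the complete download in a temporary file first, so slow
        # processing can no longer stall the HTTP connection
        t = tempfile.SpooledTemporaryFile(max_size=100 * 1024 * 1024)

        with urllib.request.urlopen(url) as res:
            while True:
                buf = res.read(65536)
                if not buf:
                    break
                t.write(buf)

        # Rewind so the caller can read from the beginning
        t.seek(0)

        # Transparently decompress gzipped payloads
        if res.headers.get("Content-Type") in ("application/x-gzip", "application/gzip"):
            t = gzip.GzipFile(fileobj=t, mode="rb")

        return t

    # Processing only starts once the download has completed
    # (placeholder URL for illustration only)
    for line in retrieve("https://example.com/data.txt.gz"):
        pass  # parse the line and feed it into the database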

Fixes: #12852
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
Cc: Peter Müller <peter.mueller@ipfire.org>
parent aa23e03d
@@ -19,6 +19,7 @@
 import gzip
 import logging
+import tempfile
 import urllib.request
 
 # Initialise logging
@@ -106,75 +107,76 @@ class Downloader(object):
         log.info("Using proxy %s" % url)
         self.proxy = url
 
-    def request(self, url, data=None, return_blocks=False):
+    def retrieve(self, url, data=None):
+        """
+            This method will fetch the content at the given URL
+            and will return a file-object to a temporary file.
+
+            If the content was compressed, it will be decompressed on the fly.
+        """
+        # Open a temporary file to buffer the downloaded content
+        t = tempfile.SpooledTemporaryFile(max_size=100 * 1024 * 1024)
+
         # Create a new request
         req = urllib.request.Request(url, data=data)
 
         # Configure proxy
         if self.proxy:
             req.set_proxy(self.proxy, "http")
 
-        return DownloaderContext(self, req, return_blocks=return_blocks)
-
-
-class DownloaderContext(object):
-    def __init__(self, downloader, request, return_blocks=False):
-        self.downloader = downloader
-        self.request = request
-
-        # Should we return one block or a single line?
-        self.return_blocks = return_blocks
-
-        # Save the response object
-        self.response = None
-
-    def __enter__(self):
-        log.info("Retrieving %s..." % self.request.full_url)
+        log.info("Retrieving %s..." % req.full_url)
 
         # Send request
-        self.response = urllib.request.urlopen(self.request)
+        res = urllib.request.urlopen(req)
 
         # Log the response headers
         log.debug("Response Headers:")
-        for header in self.headers:
-            log.debug("    %s: %s" % (header, self.get_header(header)))
+        for header in res.headers:
+            log.debug("    %s: %s" % (header, res.headers[header]))
 
-        return self
+        # Write the payload to the temporary file
+        with res as f:
+            while True:
+                buf = f.read(65536)
+                if not buf:
+                    break
 
-    def __exit__(self, type, value, traceback):
-        pass
+                t.write(buf)
 
-    def __iter__(self):
-        """
-            Makes the object iterable by going through each block
-        """
-        if self.return_blocks:
-            return iterate_over_blocks(self.body)
+        # Rewind the temporary file
+        t.seek(0)
 
-        return iterate_over_lines(self.body)
+        # Fetch the content type
+        content_type = res.headers.get("Content-Type")
 
-    @property
-    def headers(self):
-        if self.response:
-            return self.response.headers
+        # Decompress any gzipped response on the fly
+        if content_type in ("application/x-gzip", "application/gzip"):
+            t = gzip.GzipFile(fileobj=t, mode="rb")
 
-    def get_header(self, name):
-        if self.headers:
-            return self.headers.get(name)
+        # Return the temporary file handle
+        return t
 
-    @property
-    def body(self):
-        """
-            Returns a file-like object with the decoded content
-            of the response.
-        """
-        content_type = self.get_header("Content-Type")
+    def request_blocks(self, url, data=None):
+        """
+            This method will fetch the data from the URL and return an
+            iterator for each block in the data.
+        """
+        # Download the data first
+        t = self.retrieve(url, data=data)
 
-        # Decompress any gzipped response on the fly
-        if content_type in ("application/x-gzip", "application/gzip"):
-            return gzip.GzipFile(fileobj=self.response, mode="rb")
+        # Then, split it into blocks
+        return iterate_over_blocks(t)
 
-        # Return the response by default
-        return self.response
+    def request_lines(self, url, data=None):
+        """
+            This method will fetch the data from the URL and return an
+            iterator for each line in the data.
+        """
+        # Download the data first
+        t = self.retrieve(url, data=data)
+
+        # Then, split it into lines
+        return iterate_over_lines(t)
 
 
 def read_blocks(f):
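For comparison, a rough sketch of how the calling convention changes for the
import code; downloader, url and process() are placeholders standing in for
the real call sites updated further down:

    # Before: callers entered a context manager and iterated the response
    with downloader.request(url, return_blocks=True) as f:
        for block in f:
            process(block)

    # After: the Downloader buffers the download first and returns an iterator
    for block in downloader.request_blocks(url):
        process(block)

    # Line-oriented sources use the matching helper
    for line in downloader.request_lines(url):
        process(line)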
@@ -435,9 +435,8 @@ class CLI(object):
         for source_key in location.importer.WHOIS_SOURCES:
             for single_url in location.importer.WHOIS_SOURCES[source_key]:
-                with downloader.request(single_url, return_blocks=True) as f:
-                    for block in f:
-                        self._parse_block(block, source_key, validcountries)
+                for block in downloader.request_blocks(single_url):
+                    self._parse_block(block, source_key, validcountries)
 
         # Process all parsed networks from every RIR we happen to have access to,
         # insert the largest network chunks into the networks table immediately...
@@ -518,9 +517,8 @@ class CLI(object):
             for single_url in location.importer.EXTENDED_SOURCES[source_key]:
                 with self.db.transaction():
                     # Download data
-                    with downloader.request(single_url) as f:
-                        for line in f:
-                            self._parse_line(line, source_key, validcountries)
+                    for line in downloader.request_lines(single_url):
+                        self._parse_line(line, source_key, validcountries)
 
         # Download and import (technical) AS names from ARIN
         self._import_as_names_from_arin()
...@@ -871,50 +869,46 @@ class CLI(object): ...@@ -871,50 +869,46 @@ class CLI(object):
# technical, not intended for human consumption, as description fields in # technical, not intended for human consumption, as description fields in
# organisation handles for other RIRs are - however, this is what we have got, # organisation handles for other RIRs are - however, this is what we have got,
# and in some cases, it might be still better than nothing) # and in some cases, it might be still better than nothing)
with downloader.request("https://ftp.arin.net/info/asn.txt", return_blocks=False) as f: for line in downloader.request_lines("https://ftp.arin.net/info/asn.txt"):
for line in f: # Valid lines start with a space, followed by the number of the Autonomous System ...
# Convert binary line to string... if not line.startswith(" "):
line = str(line) continue
# ... valid lines start with a space, followed by the number of the Autonomous System ...
if not line.startswith(" "):
continue
# Split line and check if there is a valid ASN in it... # Split line and check if there is a valid ASN in it...
asn, name = line.split()[0:2] asn, name = line.split()[0:2]
try: try:
asn = int(asn) asn = int(asn)
except ValueError: except ValueError:
log.debug("Skipping ARIN AS names line not containing an integer for ASN") log.debug("Skipping ARIN AS names line not containing an integer for ASN")
continue continue
# Filter invalid ASNs... # Filter invalid ASNs...
if not self._check_parsed_asn(asn): if not self._check_parsed_asn(asn):
continue continue
# Skip any AS name that appears to be a placeholder for a different RIR or entity... # Skip any AS name that appears to be a placeholder for a different RIR or entity...
if re.match(r"^(ASN-BLK|)(AFCONC|AFRINIC|APNIC|ASNBLK|LACNIC|RIPE|IANA)(?:\d?$|\-)", name): if re.match(r"^(ASN-BLK|)(AFCONC|AFRINIC|APNIC|ASNBLK|LACNIC|RIPE|IANA)(?:\d?$|\-)", name):
continue continue
# Bail out in case the AS name contains anything we do not expect here... # Bail out in case the AS name contains anything we do not expect here...
if re.search(r"[^a-zA-Z0-9-_]", name): if re.search(r"[^a-zA-Z0-9-_]", name):
log.debug("Skipping ARIN AS name for %s containing invalid characters: %s" % \ log.debug("Skipping ARIN AS name for %s containing invalid characters: %s" % \
(asn, name)) (asn, name))
# Things look good here, run INSERT statement and skip this one if we already have # Things look good here, run INSERT statement and skip this one if we already have
# a (better?) name for this Autonomous System... # a (better?) name for this Autonomous System...
self.db.execute(""" self.db.execute("""
INSERT INTO autnums( INSERT INTO autnums(
number, number,
name,
source
) VALUES (%s, %s, %s)
ON CONFLICT (number) DO NOTHING""",
asn,
name, name,
"ARIN", source
) ) VALUES (%s, %s, %s)
ON CONFLICT (number) DO NOTHING""",
asn,
name,
"ARIN",
)
def handle_update_announcements(self, ns): def handle_update_announcements(self, ns):
server = ns.server[0] server = ns.server[0]
@@ -1262,8 +1256,11 @@ class CLI(object):
         downloader = location.importer.Downloader()
 
         try:
-            with downloader.request("https://ip-ranges.amazonaws.com/ip-ranges.json", return_blocks=False) as f:
-                aws_ip_dump = json.load(f.body)
+            # Fetch IP ranges
+            f = downloader.retrieve("https://ip-ranges.amazonaws.com/ip-ranges.json")
+
+            # Parse downloaded file
+            aws_ip_dump = json.load(f)
         except Exception as e:
             log.error("unable to preprocess Amazon AWS IP ranges: %s" % e)
             return
@@ -1386,12 +1383,11 @@ class CLI(object):
         ]
 
         for url in ip_urls:
-            try:
-                with downloader.request(url, return_blocks=False) as f:
-                    fcontent = f.body.readlines()
-            except Exception as e:
-                log.error("Unable to download Spamhaus DROP URL %s: %s" % (url, e))
-                return
+            # Fetch IP list
+            f = downloader.retrieve(url)
+
+            # Split into lines
+            fcontent = f.readlines()
 
             # Conduct a very basic sanity check to rule out CDN issues causing bogus DROP
             # downloads.
@@ -1408,7 +1404,6 @@ class CLI(object):
             # the override table in case they are valid...
             with self.db.transaction():
                 for sline in fcontent:
-
                     # The response is assumed to be encoded in UTF-8...
                     sline = sline.decode("utf-8")
@@ -1443,18 +1438,13 @@ class CLI(object):
         )
 
         for url in asn_urls:
-            try:
-                with downloader.request(url, return_blocks=False) as f:
-                    fcontent = f.body.readlines()
-            except Exception as e:
-                log.error("Unable to download Spamhaus DROP URL %s: %s" % (url, e))
-                return
+            # Fetch URL
+            f = downloader.retrieve(url)
 
             # Iterate through every line, filter comments and add remaining ASNs to
             # the override table in case they are valid...
             with self.db.transaction():
-                for sline in fcontent:
+                for sline in t.readlines():
                     # The response is assumed to be encoded in UTF-8...
                     sline = sline.decode("utf-8")