Commit 16c07bf4 authored by bescoto

Final changes for 0.11.4


git-svn-id: http://svn.savannah.nongnu.org/svn/rdiff-backup@301 2b77aa54-bcbc-44c9-a7ec-4f6cf2b41109
parent 213a7c33
New in v0.11.4 (2003/04/01)
New in v0.11.4 (2003/03/15)
---------------------------
Fixed bug incrementing sockets whose filenames were pretty long, but
......@@ -7,6 +7,14 @@ not super long. Reported by Olivier Mueller.
Added Albert Chin-A-Young's patch to add a few options to the setup.py
install script.
Apparently fixed rare utime type bug. Thanks to Christian Skarby for
report and testing.
Added detailed file_statistics (in addition to session_statistics) as
requested by Dean Gaudet. Disable with --no-file-statistics option.
Minor speed enhancements.
New in v0.11.3 (2003/03/04)
---------------------------
......
---------[ Medium term ]---------------------------------------
Look at Kent Borg's suggestion for restore options and digests.
......@@ -13,8 +14,6 @@ Add # of increments option to --remove-older-than
Make argument shortcut for cstream
Make --calculate-averages work with directory_statistics file.
Write configuration file, to make sure settings like --quoting-char,
--windows-time-format, etc., don't change between sessions,
backup/restoring, etc.
......
......@@ -218,6 +218,11 @@ Do not compress increments based on files whose filenames match regexp.
The default is
"(?i).*\\.(gz|z|bz|bz2|tgz|zip|rpm|deb|jpg|gif|png|jp2|mp3|ogg|avi|wmv|mpeg|mpg|rm|mov)$"
.TP
.B --no-file-statistics
This will disable writing to the file_statistics file in the
rdiff-backup-data directory. rdiff-backup will run slightly quicker
and take up a bit less space.
.TP
.BI --no-hard-links
Don't replicate hard links on destination side. Note that because
metadata is written to a separate file, hard link information will not
......
......@@ -148,6 +148,10 @@ ssh_compression = 1
# If true, print statistics after successful backup
print_statistics = None
# Controls whether file_statistics file is written in
# rdiff-backup-data dir. These can sometimes take up a lot of space.
file_statistics = 1
# On the writer connection, the following will be set to the mirror
# Select iterator.
select_mirror = None
......
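For orientation, here is a minimal sketch of how such a module-level setting is meant to be used: the option handler later in this diff calls Globals.set('file_statistics', 0) for --no-file-statistics, and the backup code checks Globals.file_statistics before opening the statistics file. The set() helper below is an illustrative stand-in only (the real Globals module does more bookkeeping).

# Illustrative stand-in for the Globals settings module (sketch only).
file_statistics = 1      # write the file_statistics file by default
print_statistics = None  # print session statistics after a successful backup

def set(name, val):
    # Assign a module-level setting by name, as Globals.set("...", ...) does.
    globals()[name] = val

if __name__ == "__main__":
    set("file_statistics", 0)   # effect of --no-file-statistics
    print(file_statistics)      # -> 0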
......@@ -53,13 +53,14 @@ def parse_cmdlineoptions(arglist):
"include-globbing-filelist=", "include-regexp=",
"list-changed-since=", "list-increments",
"no-compare-inode", "no-compression",
"no-compression-regexp=", "no-hard-links", "null-separator",
"parsable-output", "print-statistics", "quoting-char=",
"remote-cmd=", "remote-schema=", "remove-older-than=",
"restore-as-of=", "restrict=", "restrict-read-only=",
"restrict-update-only=", "server", "ssh-no-compression",
"terminal-verbosity=", "test-server", "verbosity=",
"version", "windows-mode", "windows-time-format"])
"no-compression-regexp=", "no-file-statistics",
"no-hard-links", "null-separator", "parsable-output",
"print-statistics", "quoting-char=", "remote-cmd=",
"remote-schema=", "remove-older-than=", "restore-as-of=",
"restrict=", "restrict-read-only=", "restrict-update-only=",
"server", "ssh-no-compression", "terminal-verbosity=",
"test-server", "verbosity=", "version", "windows-mode",
"windows-time-format"])
except getopt.error, e:
commandline_error("Bad commandline options: %s" % str(e))
......@@ -108,6 +109,7 @@ def parse_cmdlineoptions(arglist):
elif opt == "--no-compression": Globals.set("compression", None)
elif opt == "--no-compression-regexp":
Globals.set("no_compression_regexp_string", arg)
elif opt == "--no-file-statistics": Globals.set('file_statistics', 0)
elif opt == "--no-hard-links": Globals.set('preserve_hardlinks', 0)
elif opt == "--null-separator": Globals.set("null_separator", 1)
elif opt == "--parsable-output": Globals.set('parsable_output', 1)
......
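The hunk above threads the new --no-file-statistics long option through getopt and maps it onto the corresponding Globals flag. A self-contained sketch of that pattern, written in modern Python syntax with a trimmed-down option list (a settings dict standing in for the Globals module), might look like this:

import getopt
import sys

def parse_cmdline(arglist):
    # Defaults mirroring the flags touched by the options shown above.
    settings = {"file_statistics": 1, "preserve_hardlinks": 1, "verbosity": 3}
    try:
        opts, args = getopt.getopt(arglist, "v:",
                                   ["no-file-statistics", "no-hard-links",
                                    "verbosity="])
    except getopt.GetoptError as e:
        sys.exit("Bad commandline options: %s" % e)
    for opt, arg in opts:
        if opt == "--no-file-statistics":
            settings["file_statistics"] = 0
        elif opt == "--no-hard-links":
            settings["preserve_hardlinks"] = 0
        elif opt in ("-v", "--verbosity"):
            settings["verbosity"] = int(arg)
    return settings, args

if __name__ == "__main__":
    print(parse_cmdline(["--no-file-statistics", "-v", "5", "src", "dest"]))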
......@@ -237,14 +237,21 @@ class CacheCollatedPostProcess:
self.iter = collated_iter # generates (source_rorp, dest_rorp) pairs
self.cache_size = cache_size
self.statfileobj = statistics.init_statfileobj()
if Globals.file_statistics: statistics.FileStats.init()
metadata.OpenMetadata()
# the following should map indicies to lists [source_rorp,
# dest_rorp, changed_flag, success_flag] where changed_flag
# should be true if the rorps are different, and success_flag
# should be 1 if dest_rorp has been successfully updated to
# source_rorp, and 2 if the destination file is deleted
# entirely. They both default to false (0).
# the following should map indicies to lists
# [source_rorp, dest_rorp, changed_flag, success_flag, increment]
# changed_flag should be true if the rorps are different, and
# success_flag should be 1 if dest_rorp has been successfully
# updated to source_rorp, and 2 if the destination file is
# deleted entirely. They both default to false (0).
# increment holds the RPath of the increment file if one
# exists. It is used to record file statistics.
self.cache_dict = {}
self.cache_indicies = []
......@@ -255,7 +262,7 @@ class CacheCollatedPostProcess:
source_rorp, dest_rorp = self.iter.next()
self.pre_process(source_rorp, dest_rorp)
index = source_rorp and source_rorp.index or dest_rorp.index
self.cache_dict[index] = [source_rorp, dest_rorp, 0, 0]
self.cache_dict[index] = [source_rorp, dest_rorp, 0, 0, None]
self.cache_indicies.append(index)
if len(self.cache_indicies) > self.cache_size: self.shorten_cache()
......@@ -276,15 +283,17 @@ class CacheCollatedPostProcess:
"""Remove one element from cache, possibly adding it to metadata"""
first_index = self.cache_indicies[0]
del self.cache_indicies[0]
old_source_rorp, old_dest_rorp, changed_flag, success_flag = \
old_source_rorp, old_dest_rorp, changed_flag, success_flag, inc = \
self.cache_dict[first_index]
del self.cache_dict[first_index]
self.post_process(old_source_rorp, old_dest_rorp,
changed_flag, success_flag)
changed_flag, success_flag, inc)
def post_process(self, source_rorp, dest_rorp, changed, success):
def post_process(self, source_rorp, dest_rorp, changed, success, inc):
"""Post process source_rorp and dest_rorp.
The point of this is to write statistics and metadata.
changed will be true if the files have changed. success will
be true if the files have been successfully updated (this is
always false for un-changed files).
......@@ -294,12 +303,14 @@ class CacheCollatedPostProcess:
if source_rorp: self.statfileobj.add_source_file(source_rorp)
if dest_rorp: self.statfileobj.add_dest_file(dest_rorp)
if success == 0: metadata_rorp = dest_rorp
elif success == 1:
elif success == 1 or success == 2:
self.statfileobj.add_changed(source_rorp, dest_rorp)
metadata_rorp = source_rorp
else: metadata_rorp = None
if metadata_rorp and metadata_rorp.lstat():
metadata.WriteMetadata(metadata_rorp)
if Globals.file_statistics:
statistics.FileStats.update(source_rorp, dest_rorp, changed, inc)
def in_cache(self, index):
"""Return true if given index is cached"""
......@@ -317,6 +328,10 @@ class CacheCollatedPostProcess:
"""Signal that the file with given index has changed"""
self.cache_dict[index][2] = 1
def set_inc(self, index, inc):
"""Set the increment of the current file"""
self.cache_dict[index][4] = inc
def get_rorps(self, index):
"""Retrieve (source_rorp, dest_rorp) from cache"""
return self.cache_dict[index][:2]
......@@ -337,6 +352,7 @@ class CacheCollatedPostProcess:
while self.cache_indicies: self.shorten_cache()
metadata.CloseMetadata()
if Globals.print_statistics: statistics.print_active_stats()
if Globals.file_statistics: statistics.FileStats.close()
statistics.write_active_statfileobj()
......@@ -511,6 +527,7 @@ class IncrementITRB(PatchITRB):
if self.patch_to_temp(rp, diff_rorp, tf):
inc = self.inc_with_checking(tf, rp, self.get_incrp(index))
if inc is not None:
self.CCPP.set_inc(index, inc)
if inc.isreg():
inc.fsync_with_dir() # Write inc before rp changed
if tf.lstat():
......@@ -531,10 +548,12 @@ class IncrementITRB(PatchITRB):
inc = self.inc_with_checking(diff_rorp, base_rp,
self.get_incrp(index))
if inc and inc.isreg():
inc.fsync_with_dir() # must writte inc before rp changed
inc.fsync_with_dir() # must write inc before rp changed
self.prepare_dir(diff_rorp, base_rp)
elif (self.set_dir_replacement(diff_rorp, base_rp) and
self.inc_with_checking(self.dir_replacement, base_rp,
self.get_incrp(index))):
self.CCPP.flag_success(index)
elif self.set_dir_replacement(diff_rorp, base_rp):
inc = self.inc_with_checking(self.dir_replacement, base_rp,
self.get_incrp(index))
if inc:
self.CCPP.set_inc(index, inc)
self.CCPP.flag_success(index)
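To make the bookkeeping above easier to follow, here is a small illustrative model of the cache entry lifecycle: each index maps to [source_rorp, dest_rorp, changed_flag, success_flag, increment], the new increment slot is filled by set_inc() when an increment file is written, and all five fields are read back when the entry leaves the cache. Everything below (class name, the plain strings standing in for rorps and RPaths) is hypothetical.

# Hypothetical, simplified model of the CacheCollatedPostProcess cache;
# real entries hold rorp objects and RPaths, not strings.
class CacheModel(object):
    def __init__(self):
        self.cache = {}   # index -> [source, dest, changed, success, inc]
        self.order = []   # indices in arrival order

    def add(self, index, source, dest):
        self.cache[index] = [source, dest, 0, 0, None]
        self.order.append(index)

    def flag_changed(self, index):
        self.cache[index][2] = 1

    def set_inc(self, index, inc):
        self.cache[index][4] = inc

    def flag_success(self, index):
        self.cache[index][3] = 1

    def shorten(self):
        index = self.order.pop(0)
        source, dest, changed, success, inc = self.cache.pop(index)
        # Here the real code writes metadata and, if enabled, calls
        # statistics.FileStats.update(source, dest, changed, inc).
        return source, dest, changed, success, inc

if __name__ == "__main__":
    model = CacheModel()
    model.add(("dir", "file"), "source_rorp", "dest_rorp")
    model.flag_changed(("dir", "file"))
    model.set_inc(("dir", "file"), "increments/dir/file.2003-04-01.diff.gz")
    model.flag_success(("dir", "file"))
    print(model.shorten())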
......@@ -104,26 +104,20 @@ def RORP2Record(rorpath):
str_list.append(" Permissions %s\n" % rorpath.getperms())
return "".join(str_list)
line_parsing_regexp = re.compile("^ *([A-Za-z0-9]+) (.+)$")
line_parsing_regexp = re.compile("^ *([A-Za-z0-9]+) (.+)$", re.M)
def Record2RORP(record_string):
"""Given record_string, return RORPath
For speed reasons, write the RORPath data dictionary directly
instead of calling rorpath functions. This depends on the
instead of calling rorpath functions. Profiling has shown this to
be a time critical function.
"""
data_dict = {}
index_list = [None] # put in list so we can modify using parse_line
def process_line(line):
"""Process given line, and modify data_dict or index_list"""
if not line: return # skip empty lines
match = line_parsing_regexp.search(line)
if not match: raise ParsingError("Bad line: '%s'" % line)
field, data = match.group(1), match.group(2)
for field, data in line_parsing_regexp.findall(record_string):
if field == "File":
if data == ".": index_list[0] = ()
else: index_list[0] = tuple(unquote_path(data).split("/"))
if data == ".": index = ()
else: index = tuple(unquote_path(data).split("/"))
elif field == "Type":
if data == "None": data_dict['type'] = None
else: data_dict['type'] = data
......@@ -140,9 +134,7 @@ def Record2RORP(record_string):
elif field == "Gid": data_dict['gid'] = int(data)
elif field == "Permissions": data_dict['perms'] = int(data)
else: raise ParsingError("Unknown field in line '%s'" % line)
map(process_line, record_string.split("\n"))
return rpath.RORPath(index_list[0], data_dict)
return rpath.RORPath(index, data_dict)
chars_to_quote = re.compile("\\n|\\\\")
def quote_path(path_string):
......@@ -260,6 +252,7 @@ class rorp_extractor:
metadata_rp = None
metadata_fileobj = None
metadata_record_buffer = [] # Use this because gzip writes are slow
def OpenMetadata(rp = None, compress = 1):
"""Open the Metadata file for writing, return metadata fileobj"""
global metadata_rp, metadata_fileobj
......@@ -274,13 +267,20 @@ def OpenMetadata(rp = None, compress = 1):
def WriteMetadata(rorp):
"""Write metadata of rorp to file"""
global metadata_fileobj
metadata_fileobj.write(RORP2Record(rorp))
global metadata_fileobj, metadata_record_buffer
metadata_record_buffer.append(RORP2Record(rorp))
if len(metadata_record_buffer) >= 100: write_metadata_buffer()
def write_metadata_buffer():
global metadata_record_buffer
metadata_fileobj.write("".join(metadata_record_buffer))
metadata_record_buffer = []
def CloseMetadata():
"""Close the metadata file"""
global metadata_rp, metadata_fileobj
assert metadata_fileobj, "Metadata file not open"
if metadata_record_buffer: write_metadata_buffer()
try: fileno = metadata_fileobj.fileno() # will not work if GzipFile
except AttributeError: fileno = metadata_fileobj.fileobj.fileno()
os.fsync(fileno)
......
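The rewrite above replaces the per-line process_line() helper with a single findall() over the whole record, which is why the regexp now carries re.M (so ^ and $ match at every line boundary). A short illustration of that parsing step, using the same regexp on a hypothetical record, is:

import re

line_parsing_regexp = re.compile("^ *([A-Za-z0-9]+) (.+)$", re.M)

# Hypothetical record string in the metadata file's line-oriented format.
sample_record = ("File some/dir/file.txt\n"
                 "Type reg\n"
                 "Uid 1000\n"
                 "Gid 1000\n"
                 "Permissions 420\n")

for field, data in line_parsing_regexp.findall(sample_record):
    print(field, "->", data)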
......@@ -440,8 +440,8 @@ class RORPath:
def getnumlinks(self):
"""Number of places inode is linked to"""
try: return self.data['nlink']
except KeyError: return 1
if self.data.has_key('nlink'): return self.data['nlink']
else: return 1
def readlink(self):
"""Wrapper around os.readlink()"""
......
......@@ -20,7 +20,7 @@
"""Generate and process aggregated backup information"""
import re, os, time
import Globals, robust, Time, rorpiter, increment, log
import Globals, Time, increment, log, static
class StatsException(Exception): pass
......@@ -347,6 +347,69 @@ def print_active_stats():
"""Print statistics of active statobj to stdout and log"""
global _active_statfileobj
assert _active_statfileobj
_active_statfileobj.finish()
statmsg = _active_statfileobj.get_stats_logstring("Session statistics")
log.Log.log_to_file(statmsg)
Globals.client_conn.sys.stdout.write(statmsg)
class FileStats:
"""Keep track of less detailed stats on file-by-file basis"""
_fileobj, _rp = None, None
_line_sep = None
def init(cls):
"""Open file stats object and prepare to write"""
assert not (cls._fileobj or cls._rp), (cls._fileobj, cls._rp)
rpbase = Globals.rbdir.append("file_statistics")
suffix = Globals.compression and 'data.gz' or 'data'
cls._rp = increment.get_inc(rpbase, suffix, Time.curtime)
assert not cls._rp.lstat()
cls._fileobj = cls._rp.open("wb", compress = Globals.compression)
cls._line_sep = Globals.null_separator and '\0' or '\n'
cls.write_docstring()
cls.line_buffer = []
def write_docstring(cls):
"""Write the first line (a documentation string) into file"""
cls._fileobj.write("# Format of each line in file statistics file:")
cls._fileobj.write(cls._line_sep)
cls._fileobj.write("# Filename Changed SourceSize MirrorSize "
"IncrementSize" + cls._line_sep)
def update(cls, source_rorp, dest_rorp, changed, inc):
"""Update file stats with given information"""
if source_rorp: filename = source_rorp.get_indexpath()
else: filename = dest_rorp.get_indexpath()
size_list = map(cls.get_size, [source_rorp, dest_rorp, inc])
line = " ".join([filename, str(changed)] + size_list)
cls.line_buffer.append(line)
if len(cls.line_buffer) >= 100: cls.write_buffer()
def get_size(cls, rorp):
"""Return the size of rorp as string, or "NA" if not a regular file"""
if not rorp: return "NA"
if rorp.isreg(): return str(rorp.getsize())
else: return "0"
def write_buffer(cls):
"""Write buffer to file because buffer is full
The buffer part is necessary because the GzipFile.write()
method seems fairly slow.
"""
assert cls.line_buffer and cls._fileobj
cls.line_buffer.append('') # have join add _line_sep to end also
cls._fileobj.write(cls._line_sep.join(cls.line_buffer))
cls.line_buffer = []
def close(cls):
"""Close file stats file"""
assert cls._fileobj, cls._fileobj
if cls.line_buffer: cls.write_buffer()
assert not cls._fileobj.close()
cls._fileobj = cls._rp = None
static.MakeClass(FileStats)
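FileStats above writes one space-separated line per file ("# Filename Changed SourceSize MirrorSize IncrementSize") and flushes its line buffer in batches of 100 because many small GzipFile.write() calls are relatively slow. A self-contained sketch of that buffered-writer pattern, with a hypothetical class name, file name, and sizes, could look like the following:

import gzip

class BufferedStatsWriter(object):
    """Hypothetical stand-in for FileStats: batch lines before gzip writes."""

    def __init__(self, path, flush_every=100):
        self.fileobj = gzip.open(path, "wb")
        self.buffer = []
        self.flush_every = flush_every
        self.fileobj.write(
            b"# Filename Changed SourceSize MirrorSize IncrementSize\n")

    def update(self, filename, changed, source_size, mirror_size, inc_size):
        # One line per file, mirroring the format documented above.
        line = " ".join([filename, str(changed), str(source_size),
                         str(mirror_size), str(inc_size)])
        self.buffer.append(line.encode("ascii"))
        if len(self.buffer) >= self.flush_every:
            self.flush()

    def flush(self):
        # Batch the buffered lines into a single write call.
        if self.buffer:
            self.fileobj.write(b"\n".join(self.buffer) + b"\n")
            self.buffer = []

    def close(self):
        self.flush()
        self.fileobj.close()

if __name__ == "__main__":
    stats = BufferedStatsWriter("file_statistics.example.data.gz")
    stats.update("some/dir/file.txt", 1, 4096, 4096, 120)
    stats.close()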
......@@ -8,7 +8,7 @@ class MetadataTest(unittest.TestCase):
def make_temp(self):
"""Make temp directory testfiles/output"""
global tempdir
tempdir.delete()
if tempdir.lstat(): tempdir.delete()
tempdir.mkdir()
def testQuote(self):
......
......@@ -180,26 +180,6 @@ class IncStatTest(unittest.TestCase):
rbdir = rpath.RPath(Globals.local_connection,
"testfiles/output/rdiff-backup-data")
#incs = Restore.get_inclist(rbdir.append("subdir").
# append("directory_statistics"))
#assert len(incs) == 2
#s1 = StatsObj().read_stats_from_rp(incs[0]) # initial mirror stats
#assert s1.SourceFiles == 2
#assert 400000 < s1.SourceFileSize < 420000
#self.stats_check_initial(s1)
#subdir_stats = StatsObj().read_stats_from_rp(incs[1]) # increment stats
#assert subdir_stats.SourceFiles == 2
#assert 400000 < subdir_stats.SourceFileSize < 420000
#assert subdir_stats.MirrorFiles == 2
#assert 400000 < subdir_stats.MirrorFileSize < 420000
#assert subdir_stats.NewFiles == subdir_stats.NewFileSize == 0
#assert subdir_stats.DeletedFiles == subdir_stats.DeletedFileSize == 0
#assert subdir_stats.ChangedFiles == 2
#assert 400000 < subdir_stats.ChangedSourceSize < 420000
#assert 400000 < subdir_stats.ChangedMirrorSize < 420000
#assert 10 < subdir_stats.IncrementFileSize < 20000
incs = restore.get_inclist(rbdir.append("session_statistics"))
assert len(incs) == 2
s2 = statistics.StatsObj().read_stats_from_rp(incs[0])
......@@ -214,7 +194,7 @@ class IncStatTest(unittest.TestCase):
assert 700000 <= root_stats.MirrorFileSize < 750000
assert root_stats.NewFiles == 1
assert root_stats.NewFileSize == 0
assert root_stats.DeletedFiles == 1
assert root_stats.DeletedFiles == 1, root_stats.DeletedFiles
assert root_stats.DeletedFileSize == 200000
assert 3 <= root_stats.ChangedFiles <= 4, root_stats.ChangedFiles
assert 450000 <= root_stats.ChangedSourceSize < 470000
......
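For reference, the short script below runs the metadata test suite under the Python profiler, saves the raw profiling data to "profile-output", and prints the 40 functions with the most internal run time.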
import profile, pstats
from metadatatest import *
profile.run("unittest.main()", "profile-output")
p = pstats.Stats("profile-output")
p.sort_stats('time')
p.print_stats(40)