From 5981ca03b288fa5fce71781a4b9627038c8595a1 Mon Sep 17 00:00:00 2001 From: Marco Mariani <marco.mariani@nexedi.com> Date: Thu, 16 Oct 2014 12:17:04 +0200 Subject: [PATCH] gzip: added --rsyncable option; kvm: compress backup images --- component/gzip/buildout.cfg | 9 + component/gzip/rsyncable.diff | 298 ++++++++++++++++++++ software/kvm/common.cfg | 10 +- software/kvm/instance-kvm-export.cfg.jinja2 | 1 + software/kvm/instance-kvm-import.cfg.in | 2 + software/kvm/instance.cfg.in | 1 + software/kvm/template/kvm-export.sh.jinja2 | 14 + software/kvm/template/kvm-import.sh.jinja2 | 10 +- 8 files changed, 337 insertions(+), 8 deletions(-) create mode 100644 component/gzip/rsyncable.diff diff --git a/component/gzip/buildout.cfg b/component/gzip/buildout.cfg index 6c4b8da44..9f21a4b37 100644 --- a/component/gzip/buildout.cfg +++ b/component/gzip/buildout.cfg @@ -10,3 +10,12 @@ url = ftp://ftp.gnu.org/pub/gnu/gzip/gzip-1.6.tar.xz md5sum = da981f86677d58a106496e68de6f8995 environment = PATH=${xz-utils:location}/bin:%(PATH)s +patch-options = -p1 +# The --rsyncable patch is from debian/ubuntu, +# specifically https://launchpad.net/ubuntu/+source/gzip/1.6-3ubuntu1 +# It is required to minimize the bandwidth used by rsync. +# For an explanation, see http://beeznest.wordpress.com/2005/02/03/rsyncable-gzip/ +# Hunks for .texi files have been removed to avoid a dependency on makeinfo. +patches = + ${:_profile_base_location_}/rsyncable.diff#0587af03a5580e2b7b4007469ee2b601 + diff --git a/component/gzip/rsyncable.diff b/component/gzip/rsyncable.diff new file mode 100644 index 000000000..37781772c --- /dev/null +++ b/component/gzip/rsyncable.diff @@ -0,0 +1,298 @@ +--- a/deflate.c ++++ b/deflate.c +@@ -131,6 +131,14 @@ + #endif + /* Matches of length 3 are discarded if their distance exceeds TOO_FAR */ + ++#ifndef RSYNC_WIN ++# define RSYNC_WIN 8192 ++#endif ++/* Size of rsync window, must be < MAX_DIST */ ++ ++#define RSYNC_SUM_MATCH(sum) (((sum) & (RSYNC_WIN - 1)) == 0) ++/* Whether window sum matches magic value */ ++ + /* =========================================================================== + * Local data used by the "longest match" routines. + */ +@@ -212,6 +220,8 @@ + unsigned good_match; + /* Use a faster search when the previous match is longer than this */ + ++local ulg rsync_sum; /* rolling sum of rsync window */ ++local ulg rsync_chunk_end; /* next rsync sequence point */ + + /* Values for max_lazy_match, good_match and max_chain_length, depending on + * the desired pack level (0..9). The values given below have been tuned to +@@ -314,6 +324,10 @@ + #endif + /* prev will be initialized on the fly */ + ++ /* rsync params */ ++ rsync_chunk_end = 0xFFFFFFFFUL; ++ rsync_sum = 0; ++ + /* Set the default configuration parameters: + */ + max_lazy_match = configuration_table[pack_level].max_lazy; +@@ -550,6 +564,8 @@ + memcpy((char*)window, (char*)window+WSIZE, (unsigned)WSIZE); + match_start -= WSIZE; + strstart -= WSIZE; /* we now have strstart >= MAX_DIST: */ ++ if (rsync_chunk_end != 0xFFFFFFFFUL) ++ rsync_chunk_end -= WSIZE; + + block_start -= (long) WSIZE; + +@@ -579,13 +595,44 @@ + } + } + ++local void rsync_roll(unsigned start, unsigned num) ++{ ++ unsigned i; ++ ++ if (start < RSYNC_WIN) { ++ /* before window fills. */ ++ for (i = start; i < RSYNC_WIN; i++) { ++ if (i == start + num) return; ++ rsync_sum += (ulg)window[i]; ++ } ++ num -= (RSYNC_WIN - start); ++ start = RSYNC_WIN; ++ } ++ ++ /* buffer after window full */ ++ for (i = start; i < start+num; i++) { ++ /* New character in */ ++ rsync_sum += (ulg)window[i]; ++ /* Old character out */ ++ rsync_sum -= (ulg)window[i - RSYNC_WIN]; ++ if (rsync_chunk_end == 0xFFFFFFFFUL && RSYNC_SUM_MATCH(rsync_sum)) ++ rsync_chunk_end = i; ++ } ++} ++ ++/* =========================================================================== ++ * Set rsync_chunk_end if window sum matches magic value. ++ */ ++#define RSYNC_ROLL(s, n) \ ++ do { if (rsync) rsync_roll((s), (n)); } while(0) ++ + /* =========================================================================== + * Flush the current block, with given end-of-file flag. + * IN assertion: strstart is set to the end of the current match. + */ + #define FLUSH_BLOCK(eof) \ + flush_block(block_start >= 0L ? (char*)&window[(unsigned)block_start] : \ +- (char*)NULL, (long)strstart - block_start, (eof)) ++ (char*)NULL, (long)strstart - block_start, flush-1, (eof)) + + /* =========================================================================== + * Processes a new input file and return its compressed length. This +@@ -596,7 +643,7 @@ + local off_t deflate_fast() + { + IPos hash_head; /* head of the hash chain */ +- int flush; /* set if current block must be flushed */ ++ int flush = 0; /* set if current block must be flushed, 2=>and padded */ + unsigned match_length = 0; /* length of best match */ + + prev_length = MIN_MATCH-1; +@@ -626,7 +673,8 @@ + + lookahead -= match_length; + +- /* Insert new strings in the hash table only if the match length ++ RSYNC_ROLL(strstart, match_length); ++ /* Insert new strings in the hash table only if the match length + * is not too large. This saves time but degrades compression. + */ + if (match_length <= max_insert_length) { +@@ -654,9 +702,14 @@ + /* No match, output a literal byte */ + Tracevv((stderr,"%c",window[strstart])); + flush = ct_tally (0, window[strstart]); ++ RSYNC_ROLL(strstart, 1); + lookahead--; + strstart++; + } ++ if (rsync && strstart > rsync_chunk_end) { ++ flush = 2; ++ rsync_chunk_end = 0xFFFFFFFFUL; ++ } + if (flush) FLUSH_BLOCK(0), block_start = strstart; + + /* Make sure that we always have enough lookahead, except +@@ -679,7 +732,7 @@ + { + IPos hash_head; /* head of hash chain */ + IPos prev_match; /* previous match */ +- int flush; /* set if current block must be flushed */ ++ int flush = 0; /* set if current block must be flushed */ + int match_available = 0; /* set if previous match exists */ + register unsigned match_length = MIN_MATCH-1; /* length of best match */ + +@@ -730,6 +783,7 @@ + */ + lookahead -= prev_length-1; + prev_length -= 2; ++ RSYNC_ROLL(strstart, prev_length+1); + do { + strstart++; + INSERT_STRING(strstart, hash_head); +@@ -742,24 +796,38 @@ + match_available = 0; + match_length = MIN_MATCH-1; + strstart++; +- if (flush) FLUSH_BLOCK(0), block_start = strstart; + ++ if (rsync && strstart > rsync_chunk_end) { ++ rsync_chunk_end = 0xFFFFFFFFUL; ++ flush = 2; ++ } ++ if (flush) FLUSH_BLOCK(0), block_start = strstart; + } else if (match_available) { + /* If there was no match at the previous position, output a + * single literal. If there was a match but the current match + * is longer, truncate the previous match to a single literal. + */ + Tracevv((stderr,"%c",window[strstart-1])); +- if (ct_tally (0, window[strstart-1])) { +- FLUSH_BLOCK(0), block_start = strstart; +- } ++ flush = ct_tally (0, window[strstart-1]); ++ if (rsync && strstart > rsync_chunk_end) { ++ rsync_chunk_end = 0xFFFFFFFFUL; ++ flush = 2; ++ } ++ if (flush) FLUSH_BLOCK(0), block_start = strstart; ++ RSYNC_ROLL(strstart, 1); + strstart++; + lookahead--; + } else { + /* There is no previous match to compare with, wait for + * the next step to decide. + */ ++ if (rsync && strstart > rsync_chunk_end) { ++ rsync_chunk_end = 0xFFFFFFFFUL; ++ flush = 2; ++ FLUSH_BLOCK(0), block_start = strstart; ++ } + match_available = 1; ++ RSYNC_ROLL(strstart, 1); + strstart++; + lookahead--; + } +--- a/gzip.c ++++ b/gzip.c +@@ -215,6 +215,7 @@ + unsigned insize; /* valid bytes in inbuf */ + unsigned inptr; /* index of next byte to be processed in inbuf */ + unsigned outcnt; /* bytes in output buffer */ ++int rsync = 0; /* make ryncable chunks */ + + static int handled_sig[] = + { +@@ -275,7 +276,7 @@ + {"best", 0, 0, '9'}, /* compress better */ + {"lzw", 0, 0, 'Z'}, /* make output compatible with old compress */ + {"bits", 1, 0, 'b'}, /* max number of bits per code (implies -Z) */ +- ++ {"rsyncable", 0, 0, 'R'}, /* make rsync-friendly archive */ + { 0, 0, 0, 0 } + }; + +@@ -359,6 +360,7 @@ + " -Z, --lzw produce output compatible with old compress", + " -b, --bits=BITS max number of bits per code (implies -Z)", + #endif ++ " --rsyncable Make rsync-friendly archive", + "", + "With no FILE, or when FILE is -, read standard input.", + "", +@@ -489,8 +491,11 @@ + #else + recursive = 1; + #endif +- break; +- case 'S': ++ break; ++ case 'R': ++ rsync = 1; break; ++ ++ case 'S': + #ifdef NO_MULTIPLE_DOTS + if (*optarg == '.') optarg++; + #endif +--- a/gzip.h ++++ b/gzip.h +@@ -140,6 +140,7 @@ + extern unsigned insize; /* valid bytes in inbuf */ + extern unsigned inptr; /* index of next byte to be processed in inbuf */ + extern unsigned outcnt; /* bytes in output buffer */ ++extern int rsync; /* deflate into rsyncable chunks */ + + extern off_t bytes_in; /* number of input bytes */ + extern off_t bytes_out; /* number of output bytes */ +@@ -287,7 +288,7 @@ + /* in trees.c */ + extern void ct_init (ush *attr, int *method); + extern int ct_tally (int dist, int lc); +-extern off_t flush_block (char *buf, ulg stored_len, int eof); ++extern off_t flush_block (char *buf, ulg stored_len, int pad, int eof); + + /* in bits.c */ + extern void bi_init (file_t zipfile); +--- a/gzip.1 ++++ b/gzip.1 +@@ -5,6 +5,7 @@ + .ll +8 + .B gzip + .RB [ " \-acdfhklLnNrtvV19 " ] ++.RB [ --rsyncable ] + .RB [ \-S\ suffix ] + [ + .I "name \&..." +@@ -287,6 +288,16 @@ + .I gunzip + ). + .TP ++.B --rsyncable ++While compressing, synchronize the output occasionally based on the input. ++This increases size by less than 1 percent most cases, but means that the ++.BR rsync (1) ++program can take advantage of similarities in the uncompressed input ++when syncronizing two files compressed with this flag. ++.I gunzip ++cannot tell the difference between a compressed file created with this option, ++and one created without it. ++.TP + .B \-S .suf --suffix .suf + When compressing, use suffix .suf instead of .gz. + Any non-empty suffix can be given, but suffixes +--- a/trees.c ++++ b/trees.c +@@ -856,9 +856,10 @@ + * trees or store, and output the encoded block to the zip file. This function + * returns the total compressed length for the file so far. + */ +-off_t flush_block(buf, stored_len, eof) ++off_t flush_block(buf, stored_len, pad, eof) + char *buf; /* input block, or NULL if too old */ + ulg stored_len; /* length of input block */ ++ int pad; /* pad output to byte boundary */ + int eof; /* true if this is the last block for a file */ + { + ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */ +@@ -951,6 +952,10 @@ + Assert (input_len == bytes_in, "bad input size"); + bi_windup(); + compressed_len += 7; /* align on byte boundary */ ++ } else if (pad && (compressed_len % 8) != 0) { ++ send_bits((STORED_BLOCK<<1)+eof, 3); /* send block type */ ++ compressed_len = (compressed_len + 3 + 7) & ~7L; ++ copy_block(buf, 0, 1); /* with header */ + } + + return compressed_len >> 3; diff --git a/software/kvm/common.cfg b/software/kvm/common.cfg index 801601530..36e4c39ad 100644 --- a/software/kvm/common.cfg +++ b/software/kvm/common.cfg @@ -84,7 +84,7 @@ command = [template] recipe = slapos.recipe.template url = ${:_profile_base_location_}/instance.cfg.in -md5sum = bc5a986c7208d02d3284a897ea90b39d +md5sum = 4c8f07da2217e54163c265fe6fe3d41d output = ${buildout:directory}/template.cfg mode = 0644 @@ -115,7 +115,7 @@ on-update = true [template-kvm-import] recipe = slapos.recipe.template url = ${:_profile_base_location_}/instance-kvm-import.cfg.in -md5sum = 98b1acc438895c0fa2309144a9a52a62 +md5sum = 6835c9309ff4bf4a0efd1850e6c66b24 output = ${buildout:directory}/template-kvm-import.cfg mode = 0644 @@ -123,7 +123,7 @@ mode = 0644 recipe = hexagonit.recipe.download url = ${:_profile_base_location_}/template/kvm-import.sh.jinja2 filename = kvm-import.sh.jinja2 -md5sum = 2178e7ef7f1b9ea6639bfbaf045e1feb +md5sum = 926a11421921c29f91fae8240bbcf585 download-only = true mode = 0755 @@ -131,7 +131,7 @@ mode = 0755 recipe = hexagonit.recipe.download url = ${:_profile_base_location_}/instance-kvm-export.cfg.jinja2 mode = 644 -md5sum = 4972bc6f6a4cb41c1820ac13a32915a8 +md5sum = c9f13c1f481ed08c75089aef1d3c6981 download-only = true on-update = true @@ -139,7 +139,7 @@ on-update = true recipe = hexagonit.recipe.download url = ${:_profile_base_location_}/template/kvm-export.sh.jinja2 filename = kvm-export.sh.jinja2 -md5sum = 2e2a99e33e5cdde35833d164da1e325b +md5sum = 22bd2e0c8fdb39a764a14c403a3bd752 download-only = true mode = 0755 diff --git a/software/kvm/instance-kvm-export.cfg.jinja2 b/software/kvm/instance-kvm-export.cfg.jinja2 index 46a6adef4..3d7b0dfb4 100644 --- a/software/kvm/instance-kvm-export.cfg.jinja2 +++ b/software/kvm/instance-kvm-export.cfg.jinja2 @@ -27,6 +27,7 @@ context = section directory directory section buildout buildout key socket_path kvm-instance:socket-path + raw gzip_binary {{ gzip_binary }} # Extends publish section with resilient parameters [publish-connection-information] diff --git a/software/kvm/instance-kvm-import.cfg.in b/software/kvm/instance-kvm-import.cfg.in index 1671063dc..24d25670f 100644 --- a/software/kvm/instance-kvm-import.cfg.in +++ b/software/kvm/instance-kvm-import.cfg.in @@ -33,5 +33,7 @@ mode = 0700 wrapper = $${:rendered} context = section directory directory + raw zcat_binary ${gzip:location}/bin/zcat + raw gzip_binary ${gzip:location}/bin/gzip backup-disk-path = $${directory:backup}/virtual.qcow2 diff --git a/software/kvm/instance.cfg.in b/software/kvm/instance.cfg.in index 43945ff4d..c729513ac 100644 --- a/software/kvm/instance.cfg.in +++ b/software/kvm/instance.cfg.in @@ -82,6 +82,7 @@ context = raw kvm_template $${dynamic-template-kvm:rendered} raw template_kvm_export ${template-kvm-export-script:location}/${template-kvm-export-script:filename} raw pbsready_export_template ${pbsready-export:output} + raw gzip_binary ${gzip:location}/bin/gzip mode = 0644 [dynamic-template-kvm-resilient-test] diff --git a/software/kvm/template/kvm-export.sh.jinja2 b/software/kvm/template/kvm-export.sh.jinja2 index bdb449a28..c2f2c24bd 100644 --- a/software/kvm/template/kvm-export.sh.jinja2 +++ b/software/kvm/template/kvm-export.sh.jinja2 @@ -11,5 +11,19 @@ QMP_CLIENT={{ buildout['directory'] }}/software_release/bin/qemu-qmp-client $QMP_CLIENT --socket {{ socket_path }} --drive-backup $BACKUP_DIR/$BACKUP_FILE +# Due to the way qmp works, the VM file cannot be compressed on the fly. +# Although the compression step is optional, the importer uses the .gz file +# if present. So, remove it if you are disabling the compression. + +# The downside of compression, here, is the temporary usage of more disk space +# in the exporter node. The goal is to minimize disk usage on the PBS server. + +# If you want to compress the file in-place: +# truncate -s $(gzip -c $BACKUP_FILE | dd of=$BACKUP_FILE conv=notrunc 2>&1 | sed -n '$ s/ .*$// p') $BACKUP_FILE +# but the importer script would have to be adapted. + +echo "Compressing backup..." +{{ gzip_binary }} --force --rsyncable $BACKUP_DIR/$BACKUP_FILE + cd $BACKUP_DIR && find -type f ! -name backup.signature -print0 | xargs -P4 -0 sha256sum | LC_ALL=C sort -k 66 > backup.signature diff --git a/software/kvm/template/kvm-import.sh.jinja2 b/software/kvm/template/kvm-import.sh.jinja2 index a8563fc7e..09c8cb910 100644 --- a/software/kvm/template/kvm-import.sh.jinja2 +++ b/software/kvm/template/kvm-import.sh.jinja2 @@ -16,7 +16,11 @@ write_backup_proof () { # For now we just make the diff before write_backup_proof -# TODO: Use rdiff -rm $VM_DIR/$VM_FILE -cp $BACKUP_DIR/$VM_FILE $VM_DIR/$VM_FILE +if [ -f "$BACKUP_DIR/${VM_FILE}.gz" ]; then + {{ gzip_binary }} -t "$BACKUP_DIR/${VM_FILE}.gz" || exit 10 + {{ zcat_binary }} "$BACKUP_DIR/${VM_FILE}.gz" > $VM_DIR/$VM_FILE +else + rm $VM_DIR/$VM_FILE + cp $BACKUP_DIR/$VM_FILE $VM_DIR/$VM_FILE +fi -- 2.30.9