lib/lz4: update LZ4 decompressor module

Update the LZ4 compression module based on LZ4 v1.8.3 in order for the erofs file system to use the newest LZ4_decompress_safe_partial() which can now decode exactly the nb of bytes requested [1] to take place of the open hacked code in the erofs file system itself. Currently, apart from the erofs file system, no other users use LZ4_decompress_safe_partial, so no worry about the interface. In addition, LZ4 v1.8.x boosts up decompression speed compared to the current code which is based on LZ4 v1.7.3, mainly due to shortcut optimization for the specific common LZ4-sequences [2]. lzbench testdata (tested in kirin710, 8 cores, 4 big cores at 2189Mhz, 2GB DDR RAM at 1622Mhz, with enwik8 testdata [3]): Compressor name Compress. Decompress. Compr. size Ratio Filename memcpy 5004 MB/s 4924 MB/s 100000000 100.00 enwik8 lz4hc 1.7.3 -9 12 MB/s 653 MB/s 42203253 42.20 enwik8 lz4hc 1.8.0 -9 12 MB/s 908 MB/s 42203096 42.20 enwik8 lz4hc 1.8.3 -9 11 MB/s 965 MB/s 42203094 42.20 enwik8 [1] https://github.com/lz4/lz4/issues/566 https://github.com/lz4/lz4/commit/08d347b5b217b011ff7487130b79480d8cfdaeb8 [2] v1.8.1 perf: slightly faster compression and decompression speed https://github.com/lz4/lz4/commit/a31b7058cb97e4393da55e78a77a1c6f0c9ae038 v1.8.2 perf: slightly faster HC compression and decompression speed https://github.com/lz4/lz4/commit/45f8603aae389d34c689d3ff7427b314071ccd2c https://github.com/lz4/lz4/commit/1a191b3f8d26b50a7c1d41590b529ec308d768cd [3] http://mattmahoney.net/dc/textdata.html http://mattmahoney.net/dc/enwik8.zip Link: http://lkml.kernel.org/r/1537181207-21932-1-git-send-email-gaoxiang25@huawei.comSigned-off-by: Gao Xiang <gaoxiang25@huawei.com> Tested-by: Guo Xuenan <guoxuenan@huawei.com> Cc: Colin Ian King <colin.king@canonical.com> Cc: Yann Collet <yann.collet.73@gmail.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Fang Wei <fangwei1@huawei.com> Cc: Chao Yu <yuchao0@huawei.com> Cc: Miao Xie <miaoxie@huawei.com> Cc: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Cc: Kyungsik Lee <kyungsik.lee@lge.com> Cc: <weidu.du@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

lib/lz4: update LZ4 decompressor module
Update the LZ4 compression module based on LZ4 v1.8.3 in order for the erofs file system to use the newest LZ4_decompress_safe_partial() which can now decode exactly the nb of bytes requested [1] to take place of the open hacked code in the erofs file system itself. Currently, apart from the erofs file system, no other users use LZ4_decompress_safe_partial, so no worry about the interface. In addition, LZ4 v1.8.x boosts up decompression speed compared to the current code which is based on LZ4 v1.7.3, mainly due to shortcut optimization for the specific common LZ4-sequences [2]. lzbench testdata (tested in kirin710, 8 cores, 4 big cores at 2189Mhz, 2GB DDR RAM at 1622Mhz, with enwik8 testdata [3]): Compressor name Compress. Decompress. Compr. size Ratio Filename memcpy 5004 MB/s 4924 MB/s 100000000 100.00 enwik8 lz4hc 1.7.3 -9 12 MB/s 653 MB/s 42203253 42.20 enwik8 lz4hc 1.8.0 -9 12 MB/s 908 MB/s 42203096 42.20 enwik8 lz4hc 1.8.3 -9 11 MB/s 965 MB/s 42203094 42.20 enwik8 [1] https://github.com/lz4/lz4/issues/566 https://github.com/lz4/lz4/commit/08d347b5b217b011ff7487130b79480d8cfdaeb8 [2] v1.8.1 perf: slightly faster compression and decompression speed https://github.com/lz4/lz4/commit/a31b7058cb97e4393da55e78a77a1c6f0c9ae038 v1.8.2 perf: slightly faster HC compression and decompression speed https://github.com/lz4/lz4/commit/45f8603aae389d34c689d3ff7427b314071ccd2c https://github.com/lz4/lz4/commit/1a191b3f8d26b50a7c1d41590b529ec308d768cd [3] http://mattmahoney.net/dc/textdata.html http://mattmahoney.net/dc/enwik8.zip Link: http://lkml.kernel.org/r/1537181207-21932-1-git-send-email-gaoxiang25@huawei.comSigned-off-by: Gao Xiang <gaoxiang25@huawei.com> Tested-by: Guo Xuenan <guoxuenan@huawei.com> Cc: Colin Ian King <colin.king@canonical.com> Cc: Yann Collet <yann.collet.73@gmail.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Fang Wei <fangwei1@huawei.com> Cc: Chao Yu <yuchao0@huawei.com> Cc: Miao Xie <miaoxie@huawei.com> Cc: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Cc: Kyungsik Lee <kyungsik.lee@lge.com> Cc: <weidu.du@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2209fda3 · Gao Xiang · Linus Torvalds · 8c81ddd2 · 2209fda3 · 2209fda3
Commit 2209fda3 authored Oct 30, 2018 by Gao Xiang Committed by Linus Torvalds Oct 31, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 349 additions and 141 deletions

lib/lz4/lz4_decompress.c lib/lz4/lz4_decompress.c +341 -140

lib/lz4/lz4defs.h lib/lz4/lz4defs.h +8 -1

No files found.
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -43,30 +43,36 @@
 /*-*****************************
 *	Decompression functions
 *******************************/
-/* LZ4_decompress_generic() :
- * This generic decompression function cover all use cases.
+#define DEBUGLOG(l, ...) {}	/* disabled */
- * It shall be instantiated several times, using different sets of directives
- * Note that it is important this generic function is really inlined,
+#ifndef assert
+#define assert(condition) ((void)0)
+#endif
+/*
+ * LZ4_decompress_generic() :
+ * This generic decompression function covers all use cases.
+ * It shall be instantiated several times, using different sets of directives.
+ * Note that it is important for performance that this function really get inlined,
 * in order to remove useless branches during compilation optimization.
 */
 static FORCE_INLINE int LZ4_decompress_generic(
-	 const char * const source,
+	 const char * const src,
-	 char * const dest,
+	 char * const dst,
-	 int inputSize,
+	 int srcSize,
 		/*
 		 * If endOnInput == endOnInputSize,
-		 * this value is the max size of Output Buffer.
+		 * this value is `dstCapacity`
 		 */
 	 int outputSize,
 	 /* endOnOutputSize, endOnInputSize */
-	 int endOnInput,
+	 endCondition_directive endOnInput,
 	 /* full, partial */
-	 int partialDecoding,
+	 earlyEnd_directive partialDecoding,
-	 /* only used if partialDecoding == partial */
-	 int targetOutputSize,
 	 /* noDict, withPrefix64k, usingExtDict */
-	 int dict,
+	 dict_directive dict,
-	 /* == dest when no prefix */
+	 /* always <= dst, == dst when no prefix */
 	 const BYTE * const lowPrefix,
 	 /* only if dict == usingExtDict */
 	 const BYTE * const dictStart,
@@ -74,35 +80,43 @@ static FORCE_INLINE int LZ4_decompress_generic(
 	 const size_t dictSize
 	 )
 {
-	/* Local Variables */
+	const BYTE *ip = (const BYTE *) src;
-	const BYTE *ip = (const BYTE *) source;
+	const BYTE * const iend = ip + srcSize;
-	const BYTE * const iend = ip + inputSize;
-	BYTE *op = (BYTE *) dest;
+	BYTE *op = (BYTE *) dst;
 	BYTE * const oend = op + outputSize;
 	BYTE *cpy;
-	BYTE *oexit = op + targetOutputSize;
-	const BYTE * const lowLimit = lowPrefix - dictSize;
 	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
-	static const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };
+	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
-	static const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 };
+	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
 	const int safeDecode = (endOnInput == endOnInputSize);
 	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
+	/* Set up the "end" pointers for the shortcut. */
+	const BYTE *const shortiend = iend -
+		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
+	const BYTE *const shortoend = oend -
+		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
+	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
+		 srcSize, outputSize);
 	/* Special cases */
-	/* targetOutputSize too high => decode everything */
+	assert(lowPrefix <= op);
-	if ((partialDecoding) && (oexit > oend - MFLIMIT))
+	assert(src != NULL);
-		oexit = oend - MFLIMIT;
 	/* Empty output buffer */
 	if ((endOnInput) && (unlikely(outputSize == 0)))
-		return ((inputSize == 1) && (*ip == 0)) ? 0 : -1;
+		return ((srcSize == 1) && (*ip == 0)) ? 0 : -1;
 	if ((!endOnInput) && (unlikely(outputSize == 0)))
 		return (*ip == 0 ? 1 : -1);
+	if ((endOnInput) && unlikely(srcSize == 0))
+		return -1;
 	/* Main Loop : decode sequences */
 	while (1) {
 		size_t length;
@@ -111,12 +125,74 @@ static FORCE_INLINE int LZ4_decompress_generic(
 		/* get literal length */
 		unsigned int const token = *ip++;
 		length = token>>ML_BITS;
+		/* ip < iend before the increment */
+		assert(!endOnInput || ip <= iend);
+		/*
+		 * A two-stage shortcut for the most common case:
+		 * 1) If the literal length is 0..14, and there is enough
+		 * space, enter the shortcut and copy 16 bytes on behalf
+		 * of the literals (in the fast mode, only 8 bytes can be
+		 * safely copied this way).
+		 * 2) Further if the match length is 4..18, copy 18 bytes
+		 * in a similar manner; but we ensure that there's enough
+		 * space in the output for those 18 bytes earlier, upon
+		 * entering the shortcut (in other words, there is a
+		 * combined check for both stages).
+		 */
+		if ((endOnInput ? length != RUN_MASK : length <= 8)
+		   /*
+		    * strictly "less than" on input, to re-enter
+		    * the loop with at least one byte
+		    */
+		   && likely((endOnInput ? ip < shortiend : 1) &
+			     (op <= shortoend))) {
+			/* Copy the literals */
+			memcpy(op, ip, endOnInput ? 16 : 8);
+			op += length; ip += length;
+			/*
+			 * The second stage:
+			 * prepare for match copying, decode full info.
+			 * If it doesn't work out, the info won't be wasted.
+			 */
+			length = token & ML_MASK; /* match length */
+			offset = LZ4_readLE16(ip);
+			ip += 2;
+			match = op - offset;
+			assert(match <= op); /* check overflow */
+			/* Do not deal with overlapping matches. */
+			if ((length != ML_MASK) &&
+			    (offset >= 8) &&
+			    (dict == withPrefix64k || match >= lowPrefix)) {
+				/* Copy the match. */
+				memcpy(op + 0, match + 0, 8);
+				memcpy(op + 8, match + 8, 8);
+				memcpy(op + 16, match + 16, 2);
+				op += length + MINMATCH;
+				/* Both stages worked, load the next token. */
+				continue;
+			}
+			/*
+			 * The second stage didn't work out, but the info
+			 * is ready. Propel it right to the point of match
+			 * copying.
+			 */
+			goto _copy_match;
+		}
+		/* decode literal length */
 		if (length == RUN_MASK) {
 			unsigned int s;
+			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
+				/* overflow detection */
+				goto _output_error;
+			}
 			do {
 				s = *ip++;
 				length += s;
@@ -125,14 +201,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				: 1) & (s == 255));
 			if ((safeDecode)
-				&& unlikely(
+			    && unlikely((uptrval)(op) +
-					(size_t)(op + length) < (size_t)(op))) {
+					length < (uptrval)(op))) {
 				/* overflow detection */
 				goto _output_error;
 			}
 			if ((safeDecode)
-				&& unlikely(
+			    && unlikely((uptrval)(ip) +
-					(size_t)(ip + length) < (size_t)(ip))) {
+					length < (uptrval)(ip))) {
 				/* overflow detection */
 				goto _output_error;
 			}
@@ -140,16 +216,19 @@ static FORCE_INLINE int LZ4_decompress_generic(
 		/* copy literals */
 		cpy = op + length;
-		if (((endOnInput) && ((cpy > (partialDecoding ? oexit : oend - MFLIMIT))
+		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
+		if (((endOnInput) && ((cpy > oend - MFLIMIT)
 			|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
 			|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
 			if (partialDecoding) {
 				if (cpy > oend) {
 					/*
-					 * Error :
+					 * Partial decoding :
-					 * write attempt beyond end of output buffer
+					 * stop in the middle of literal segment
 					 */
-					goto _output_error;
+					cpy = oend;
+					length = oend - op;
 				}
 				if ((endOnInput)
 					&& (ip + length > iend)) {
@@ -184,29 +263,43 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			memcpy(op, ip, length);
 			ip += length;
 			op += length;
 			/* Necessarily EOF, due to parsing restrictions */
-			break;
+			if (!partialDecoding || (cpy == oend))
+				break;
+		} else {
+			/* may overwrite up to WILDCOPYLENGTH beyond cpy */
+			LZ4_wildCopy(op, ip, cpy);
+			ip += length;
+			op = cpy;
 		}
-		LZ4_wildCopy(op, ip, cpy);
-		ip += length;
-		op = cpy;
 		/* get offset */
 		offset = LZ4_readLE16(ip);
 		ip += 2;
 		match = op - offset;
-		if ((checkOffset) && (unlikely(match < lowLimit))) {
+		/* get matchlength */
+		length = token & ML_MASK;
+_copy_match:
+		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
 			/* Error : offset outside buffers */
 			goto _output_error;
 		}
 		/* costs ~1%; silence an msan warning when offset == 0 */
-		LZ4_write32(op, (U32)offset);
+		/*
+		 * note : when partialDecoding, there is no guarantee that
+		 * at least 4 bytes remain available in output buffer
+		 */
+		if (!partialDecoding) {
+			assert(oend > op);
+			assert(oend - op >= 4);
+			LZ4_write32(op, (U32)offset);
+		}
-		/* get matchlength */
-		length = token & ML_MASK;
 		if (length == ML_MASK) {
 			unsigned int s;
@@ -221,7 +314,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			if ((safeDecode)
 				&& unlikely(
-					(size_t)(op + length) < (size_t)op)) {
+					(uptrval)(op) + length < (uptrval)op)) {
 				/* overflow detection */
 				goto _output_error;
 			}
@@ -229,24 +322,26 @@ static FORCE_INLINE int LZ4_decompress_generic(
 		length += MINMATCH;
-		/* check external dictionary */
+		/* match starting within external dictionary */
 		if ((dict == usingExtDict) && (match < lowPrefix)) {
 			if (unlikely(op + length > oend - LASTLITERALS)) {
 				/* doesn't respect parsing restriction */
-				goto _output_error;
+				if (!partialDecoding)
+					goto _output_error;
+				length = min(length, (size_t)(oend - op));
 			}
 			if (length <= (size_t)(lowPrefix - match)) {
 				/*
-				 * match can be copied as a single segment
+				 * match fits entirely within external
-				 * from external dictionary
+				 * dictionary : just copy
 				 */
 				memmove(op, dictEnd - (lowPrefix - match),
 					length);
 				op += length;
 			} else {
 				/*
-				 * match encompass external
+				 * match stretches into both external
 				 * dictionary and current block
 				 */
 				size_t const copySize = (size_t)(lowPrefix - match);
@@ -254,7 +349,6 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				memcpy(op, dictEnd - copySize, copySize);
 				op += copySize;
 				if (restSize > (size_t)(op - lowPrefix)) {
 					/* overlap copy */
 					BYTE * const endOfMatch = op + restSize;
@@ -267,23 +361,44 @@ static FORCE_INLINE int LZ4_decompress_generic(
 					op += restSize;
 				}
 			}
 			continue;
 		}
 		/* copy match within block */
 		cpy = op + length;
-		if (unlikely(offset < 8)) {
+		/*
-			const int dec64 = dec64table[offset];
+		 * partialDecoding :
+		 * may not respect endBlock parsing restrictions
+		 */
+		assert(op <= oend);
+		if (partialDecoding &&
+		    (cpy > oend - MATCH_SAFEGUARD_DISTANCE)) {
+			size_t const mlen = min(length, (size_t)(oend - op));
+			const BYTE * const matchEnd = match + mlen;
+			BYTE * const copyEnd = op + mlen;
+			if (matchEnd > op) {
+				/* overlap copy */
+				while (op < copyEnd)
+					*op++ = *match++;
+			} else {
+				memcpy(op, match, mlen);
+			}
+			op = copyEnd;
+			if (op == oend)
+				break;
+			continue;
+		}
+		if (unlikely(offset < 8)) {
 			op[0] = match[0];
 			op[1] = match[1];
 			op[2] = match[2];
 			op[3] = match[3];
-			match += dec32table[offset];
+			match += inc32table[offset];
 			memcpy(op + 4, match, 4);
-			match -= dec64;
+			match -= dec64table[offset];
 		} else {
 			LZ4_copy8(op, match);
 			match += 8;
@@ -291,7 +406,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 		op += 8;
-		if (unlikely(cpy > oend - 12)) {
+		if (unlikely(cpy > oend - MATCH_SAFEGUARD_DISTANCE)) {
 			BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1);
 			if (cpy > oend - LASTLITERALS) {
@@ -307,60 +422,139 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				match += oCopyLimit - op;
 				op = oCopyLimit;
 			}
 			while (op < cpy)
 				*op++ = *match++;
 		} else {
 			LZ4_copy8(op, match);
 			if (length > 16)
 				LZ4_wildCopy(op + 8, match + 8, cpy);
 		}
+		op = cpy; /* wildcopy correction */
-		op = cpy; /* correction */
 	}
 	/* end of decoding */
 	if (endOnInput) {
 		/* Nb of output bytes decoded */
-		return (int) (((char *)op) - dest);
+		return (int) (((char *)op) - dst);
 	} else {
 		/* Nb of input bytes read */
-		return (int) (((const char *)ip) - source);
+		return (int) (((const char *)ip) - src);
 	}
 	/* Overflow error detected */
 _output_error:
-	return -1;
+	return (int) (-(((const char *)ip) - src)) - 1;
 }
 int LZ4_decompress_safe(const char *source, char *dest,
 	int compressedSize, int maxDecompressedSize)
 {
-	return LZ4_decompress_generic(source, dest, compressedSize,
+	return LZ4_decompress_generic(source, dest,
-		maxDecompressedSize, endOnInputSize, full, 0,
+				      compressedSize, maxDecompressedSize,
-		noDict, (BYTE *)dest, NULL, 0);
+				      endOnInputSize, decode_full_block,
+				      noDict, (BYTE *)dest, NULL, 0);
 }
-int LZ4_decompress_safe_partial(const char *source, char *dest,
+int LZ4_decompress_safe_partial(const char *src, char *dst,
-	int compressedSize, int targetOutputSize, int maxDecompressedSize)
+	int compressedSize, int targetOutputSize, int dstCapacity)
 {
-	return LZ4_decompress_generic(source, dest, compressedSize,
+	dstCapacity = min(targetOutputSize, dstCapacity);
-		maxDecompressedSize, endOnInputSize, partial,
+	return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity,
-		targetOutputSize, noDict, (BYTE *)dest, NULL, 0);
+				      endOnInputSize, partial_decode,
+				      noDict, (BYTE *)dst, NULL, 0);
 }
 int LZ4_decompress_fast(const char *source, char *dest, int originalSize)
 {
 	return LZ4_decompress_generic(source, dest, 0, originalSize,
-		endOnOutputSize, full, 0, withPrefix64k,
+				      endOnOutputSize, decode_full_block,
-		(BYTE *)(dest - 64 * KB), NULL, 64 * KB);
+				      withPrefix64k,
+				      (BYTE *)dest - 64 * KB, NULL, 0);
+}
+/* ===== Instantiate a few more decoding cases, used more than once. ===== */
+int LZ4_decompress_safe_withPrefix64k(const char *source, char *dest,
+				      int compressedSize, int maxOutputSize)
+{
+	return LZ4_decompress_generic(source, dest,
+				      compressedSize, maxOutputSize,
+				      endOnInputSize, decode_full_block,
+				      withPrefix64k,
+				      (BYTE *)dest - 64 * KB, NULL, 0);
+}
+static int LZ4_decompress_safe_withSmallPrefix(const char *source, char *dest,
+					       int compressedSize,
+					       int maxOutputSize,
+					       size_t prefixSize)
+{
+	return LZ4_decompress_generic(source, dest,
+				      compressedSize, maxOutputSize,
+				      endOnInputSize, decode_full_block,
+				      noDict,
+				      (BYTE *)dest - prefixSize, NULL, 0);
+}
+int LZ4_decompress_safe_forceExtDict(const char *source, char *dest,
+				     int compressedSize, int maxOutputSize,
+				     const void *dictStart, size_t dictSize)
+{
+	return LZ4_decompress_generic(source, dest,
+				      compressedSize, maxOutputSize,
+				      endOnInputSize, decode_full_block,
+				      usingExtDict, (BYTE *)dest,
+				      (const BYTE *)dictStart, dictSize);
 }
+static int LZ4_decompress_fast_extDict(const char *source, char *dest,
+				       int originalSize,
+				       const void *dictStart, size_t dictSize)
+{
+	return LZ4_decompress_generic(source, dest,
+				      0, originalSize,
+				      endOnOutputSize, decode_full_block,
+				      usingExtDict, (BYTE *)dest,
+				      (const BYTE *)dictStart, dictSize);
+}
+/*
+ * The "double dictionary" mode, for use with e.g. ring buffers: the first part
+ * of the dictionary is passed as prefix, and the second via dictStart + dictSize.
+ * These routines are used only once, in LZ4_decompress_*_continue().
+ */
+static FORCE_INLINE
+int LZ4_decompress_safe_doubleDict(const char *source, char *dest,
+				   int compressedSize, int maxOutputSize,
+				   size_t prefixSize,
+				   const void *dictStart, size_t dictSize)
+{
+	return LZ4_decompress_generic(source, dest,
+				      compressedSize, maxOutputSize,
+				      endOnInputSize, decode_full_block,
+				      usingExtDict, (BYTE *)dest - prefixSize,
+				      (const BYTE *)dictStart, dictSize);
+}
+static FORCE_INLINE
+int LZ4_decompress_fast_doubleDict(const char *source, char *dest,
+				   int originalSize, size_t prefixSize,
+				   const void *dictStart, size_t dictSize)
+{
+	return LZ4_decompress_generic(source, dest,
+				      0, originalSize,
+				      endOnOutputSize, decode_full_block,
+				      usingExtDict, (BYTE *)dest - prefixSize,
+				      (const BYTE *)dictStart, dictSize);
+}
+/* ===== streaming decompression functions ===== */
 int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
 	const char *dictionary, int dictSize)
 {
-	LZ4_streamDecode_t_internal *lz4sd = (LZ4_streamDecode_t_internal *) LZ4_streamDecode;
+	LZ4_streamDecode_t_internal *lz4sd =
+		&LZ4_streamDecode->internal_donotuse;
 	lz4sd->prefixSize = (size_t) dictSize;
 	lz4sd->prefixEnd = (const BYTE *) dictionary + dictSize;
@@ -382,35 +576,51 @@ int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode,
 int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode,
 	const char *source, char *dest, int compressedSize, int maxOutputSize)
 {
-	LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse;
+	LZ4_streamDecode_t_internal *lz4sd =
+		&LZ4_streamDecode->internal_donotuse;
 	int result;
-	if (lz4sd->prefixEnd == (BYTE *)dest) {
+	if (lz4sd->prefixSize == 0) {
-		result = LZ4_decompress_generic(source, dest,
+		/* The first call, no dictionary yet. */
-			compressedSize,
+		assert(lz4sd->extDictSize == 0);
-			maxOutputSize,
+		result = LZ4_decompress_safe(source, dest,
-			endOnInputSize, full, 0,
+			compressedSize, maxOutputSize);
-			usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize,
+		if (result <= 0)
-			lz4sd->externalDict,
+			return result;
-			lz4sd->extDictSize);
+		lz4sd->prefixSize = result;
+		lz4sd->prefixEnd = (BYTE *)dest + result;
+	} else if (lz4sd->prefixEnd == (BYTE *)dest) {
+		/* They're rolling the current segment. */
+		if (lz4sd->prefixSize >= 64 * KB - 1)
+			result = LZ4_decompress_safe_withPrefix64k(source, dest,
+				compressedSize, maxOutputSize);
+		else if (lz4sd->extDictSize == 0)
+			result = LZ4_decompress_safe_withSmallPrefix(source,
+				dest, compressedSize, maxOutputSize,
+				lz4sd->prefixSize);
+		else
+			result = LZ4_decompress_safe_doubleDict(source, dest,
+				compressedSize, maxOutputSize,
+				lz4sd->prefixSize,
+				lz4sd->externalDict, lz4sd->extDictSize);
 		if (result <= 0)
 			return result;
 		lz4sd->prefixSize += result;
-		lz4sd->prefixEnd	+= result;
+		lz4sd->prefixEnd  += result;
 	} else {
+		/*
+		 * The buffer wraps around, or they're
+		 * switching to another buffer.
+		 */
 		lz4sd->extDictSize = lz4sd->prefixSize;
 		lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
-		result = LZ4_decompress_generic(source, dest,
+		result = LZ4_decompress_safe_forceExtDict(source, dest,
 			compressedSize, maxOutputSize,
-			endOnInputSize, full, 0,
-			usingExtDict, (BYTE *)dest,
 			lz4sd->externalDict, lz4sd->extDictSize);
 		if (result <= 0)
 			return result;
 		lz4sd->prefixSize = result;
-		lz4sd->prefixEnd	= (BYTE *)dest + result;
+		lz4sd->prefixEnd  = (BYTE *)dest + result;
 	}
 	return result;
@@ -422,75 +632,66 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
 	LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse;
 	int result;
-	if (lz4sd->prefixEnd == (BYTE *)dest) {
+	if (lz4sd->prefixSize == 0) {
-		result = LZ4_decompress_generic(source, dest, 0, originalSize,
+		assert(lz4sd->extDictSize == 0);
-			endOnOutputSize, full, 0,
+		result = LZ4_decompress_fast(source, dest, originalSize);
-			usingExtDict,
+		if (result <= 0)
-			lz4sd->prefixEnd - lz4sd->prefixSize,
+			return result;
-			lz4sd->externalDict, lz4sd->extDictSize);
+		lz4sd->prefixSize = originalSize;
+		lz4sd->prefixEnd = (BYTE *)dest + originalSize;
+	} else if (lz4sd->prefixEnd == (BYTE *)dest) {
+		if (lz4sd->prefixSize >= 64 * KB - 1 ||
+		    lz4sd->extDictSize == 0)
+			result = LZ4_decompress_fast(source, dest,
+						     originalSize);
+		else
+			result = LZ4_decompress_fast_doubleDict(source, dest,
+				originalSize, lz4sd->prefixSize,
+				lz4sd->externalDict, lz4sd->extDictSize);
 		if (result <= 0)
 			return result;
 		lz4sd->prefixSize += originalSize;
-		lz4sd->prefixEnd	+= originalSize;
+		lz4sd->prefixEnd  += originalSize;
 	} else {
 		lz4sd->extDictSize = lz4sd->prefixSize;
 		lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
-		result = LZ4_decompress_generic(source, dest, 0, originalSize,
+		result = LZ4_decompress_fast_extDict(source, dest,
-			endOnOutputSize, full, 0,
+			originalSize, lz4sd->externalDict, lz4sd->extDictSize);
-			usingExtDict, (BYTE *)dest,
-			lz4sd->externalDict, lz4sd->extDictSize);
 		if (result <= 0)
 			return result;
 		lz4sd->prefixSize = originalSize;
-		lz4sd->prefixEnd	= (BYTE *)dest + originalSize;
+		lz4sd->prefixEnd = (BYTE *)dest + originalSize;
 	}
 	return result;
 }
-/*
+int LZ4_decompress_safe_usingDict(const char *source, char *dest,
- * Advanced decoding functions :
+				  int compressedSize, int maxOutputSize,
- * *_usingDict() :
+				  const char *dictStart, int dictSize)
- * These decoding functions work the same as "_continue" ones,
- * the dictionary must be explicitly provided within parameters
- */
-static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source,
-	char *dest, int compressedSize, int maxOutputSize, int safe,
-	const char *dictStart, int dictSize)
 {
 	if (dictSize == 0)
-		return LZ4_decompress_generic(source, dest,
+		return LZ4_decompress_safe(source, dest,
-			compressedSize, maxOutputSize, safe, full, 0,
+					   compressedSize, maxOutputSize);
-			noDict, (BYTE *)dest, NULL, 0);
+	if (dictStart+dictSize == dest) {
-	if (dictStart + dictSize == dest) {
+		if (dictSize >= 64 * KB - 1)
-		if (dictSize >= (int)(64 * KB - 1))
+			return LZ4_decompress_safe_withPrefix64k(source, dest,
-			return LZ4_decompress_generic(source, dest,
+				compressedSize, maxOutputSize);
-				compressedSize, maxOutputSize, safe, full, 0,
+		return LZ4_decompress_safe_withSmallPrefix(source, dest,
-				withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0);
+			compressedSize, maxOutputSize, dictSize);
-		return LZ4_decompress_generic(source, dest, compressedSize,
-			maxOutputSize, safe, full, 0, noDict,
-			(BYTE *)dest - dictSize, NULL, 0);
 	}
-	return LZ4_decompress_generic(source, dest, compressedSize,
+	return LZ4_decompress_safe_forceExtDict(source, dest,
-		maxOutputSize, safe, full, 0, usingExtDict,
+		compressedSize, maxOutputSize, dictStart, dictSize);
-		(BYTE *)dest, (const BYTE *)dictStart, dictSize);
-}
-int LZ4_decompress_safe_usingDict(const char *source, char *dest,
-	int compressedSize, int maxOutputSize,
-	const char *dictStart, int dictSize)
-{
-	return LZ4_decompress_usingDict_generic(source, dest,
-		compressedSize, maxOutputSize, 1, dictStart, dictSize);
 }
 int LZ4_decompress_fast_usingDict(const char *source, char *dest,
-	int originalSize, const char *dictStart, int dictSize)
+				  int originalSize,
+				  const char *dictStart, int dictSize)
 {
-	return LZ4_decompress_usingDict_generic(source, dest, 0,
+	if (dictSize == 0 || dictStart + dictSize == dest)
-		originalSize, 0, dictStart, dictSize);
+		return LZ4_decompress_fast(source, dest, originalSize);
+	return LZ4_decompress_fast_extDict(source, dest, originalSize,
+		dictStart, dictSize);
 }
 #ifndef STATIC

--- a/lib/lz4/lz4defs.h
+++ b/lib/lz4/lz4defs.h
@@ -75,6 +75,11 @@ typedef uintptr_t uptrval;
 #define WILDCOPYLENGTH 8
 #define LASTLITERALS 5
 #define MFLIMIT (WILDCOPYLENGTH + MINMATCH)
+/*
+ * ensure it's possible to write 2 x wildcopyLength
+ * without overflowing output buffer
+ */
+#define MATCH_SAFEGUARD_DISTANCE  ((2 * WILDCOPYLENGTH) - MINMATCH)
 /* Increase this value ==> compression run slower on incompressible data */
 #define LZ4_SKIPTRIGGER 6
@@ -222,6 +227,8 @@ typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
 typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
 typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
-typedef enum { full = 0, partial = 1 } earlyEnd_directive;
+typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive;
+#define LZ4_STATIC_ASSERT(c)	BUILD_BUG_ON(!(c))
 #endif