[PATCH] knfsd: Improve UTF8 checking.

From: NeilBrown <neilb@cse.unsw.edu.au>. From: Fred. We don't do all the UTF-8 checking we could in the kernel, but we do some simple checks. Implement slightly stricter, and probably more efficient, checking.

[PATCH] knfsd: Improve UTF8 checking.
From: NeilBrown <neilb@cse.unsw.edu.au>. From: Fred. We don't do all the UTF-8 checking we could in the kernel, but we do some simple checks. Implement slightly stricter, and probably more efficient, checking.
1a260c78 · Andrew Morton · Linus Torvalds · c02c0886 · 1a260c78
Commit 1a260c78 authored Apr 11, 2004 by Andrew Morton Committed by Linus Torvalds Apr 11, 2004
Show whitespace changes
Inline Side-by-side

Showing with 81 additions and 74 deletions

fs/nfsd/nfs4xdr.c fs/nfsd/nfs4xdr.c +81 -74

No files found.
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -58,93 +58,94 @@
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
-/*
+static const char utf8_byte_len[256] = {
- * From Peter Astrand <peter@cendio.se>: The following routines check
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- * whether a filename supplied by the client is valid.
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- */
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-static const char trailing_bytes_for_utf8[256] = {
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
 };
 static inline int
-is_legal_iso_utf8_sequence(unsigned char *source, int length)
+is_legal_utf8_sequence(unsigned char *source, int length)
 {
-	unsigned char a;
+	unsigned char *ptr;
-	unsigned char *srcptr;
+	unsigned char c;
-	srcptr = source + length;
-	switch (length) {
-		/* Everything else falls through when "1"... */
-	default:
-		/* Sequences with more than 6 bytes are invalid */
-		return 0;
-		/*
+	if (length==1) return 1;
-		   Byte 3-6 must be 80..BF
-		*/
-	case 6:
-		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-	case 5:
-		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-	case 4:
-		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-	case 3:
-		if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-	case 2:
-		a = *--srcptr;
-		/* Upper limit */
-		if (a > 0xBF)
-			/* 2nd byte may never be > 0xBF */
-			return 0;
-		/*
+	/* Check for overlong sequence, and check second byte */
-		   Lower limits checks, to detect non-shortest forms.
+	c = *(source + 1);
-		   No fall-through in this inner switch.
-		*/
 	switch (*source) {
 	case 0xE0: /* 3 bytes */
-			if (a < 0xA0) return 0;
+		if ( c < 0xA0 ) return 0;
 		break;
 	case 0xF0: /* 4 bytes */
-			if (a < 0x90) return 0;
+		if ( c < 0x90 ) return 0;
 		break;
 	case 0xF8: /* 5 bytes */
-			if (a < 0xC8) return 0;
+		if ( c < 0xC8 ) return 0;
 		break;
 	case 0xFC: /* 6 bytes */
-			if (a < 0x84) return 0;
+		if ( c < 0x84 ) return 0;
 		break;
 	default:
-			/* In all cases, 2nd byte must be >= 0x80 (because leading
+		if ( (c & 0xC0) != 0x80) return 0;
-			   10...) */
-			if (a < 0x80) return 0;
 	}
-	case 1:
+	/* Check that trailing bytes look like 10xxxxxx */
-		/* Invalid ranges */
+	for (ptr = source++ + length - 1; ptr>source; ptr--)
-		if (*source >= 0x80 && *source < 0xC2)
+		if ( ((*ptr) & 0xC0) != 0x80 ) return 0;
-			/* Multibyte char with value < 0xC2, non-shortest */
+	return 1;
-			return 0;
+}
-		if (*source > 0xFD)
-			/* Leading byte starting with 11111110 is illegal */
+/* This does some screening on disallowed unicode characters.  It is NOT
-			return 0;
+ * comprehensive.
-		if (!*source)
+ */
+static int
+is_allowed_utf8_char(unsigned char *source, int length)
+{
+	/* We assume length and source point to a valid utf8 sequence */
+	unsigned char c;
+	/* Disallow F0000 and up (in utf8, F3B08080) */
+	if (*source > 0xF3 ) return 0;
+	c = *(source + 1);
+	switch (*source) {
+	case 0xF3:
+		if (c >= 0xB0) return 0;
+		break;
+	/* Disallow D800-F8FF (in utf8, EDA080-EFA3BF) */
+	case 0xED:
+		if (c >= 0xA0) return 0;
+		break;
+	case 0xEE:
 		return 0;
+		break;
+	case 0xEF:
+		if (c <= 0xA3) return 0;
+	/* Disallow FFF9-FFFF (EFBFB9-EFBFBF) */
+		if (c==0xBF)
+			/* Don't need to check <=0xBF, since valid utf8 */
+			if ( *(source+2) >= 0xB9) return 0;
+		break;
 	}
 	return 1;
 }
+/* This routine should really check to see that the proper stringprep
+ * mappings have been applied.  Instead, we do a simple screen of some
+ * of the more obvious illegal values by calling is_allowed_utf8_char.
+ * This will allow many illegal strings through, but if a client behaves,
+ * it will get full functionality.  The other option (apart from full
+ * stringprep checking) is to limit everything to an easily handled subset,
+ * such as 7-bit ascii.
+ *
+ * Note - currently calling routines ignore return value except as boolean.
+ */
 static int
 check_utf8(char *str, int len)
 {
@@ -155,11 +156,17 @@ check_utf8(char *str, int len)
 	sourceend = str + len;
 	while (chunk < sourceend) {
-		chunklen = trailing_bytes_for_utf8[*chunk]+1;
+		chunklen = utf8_byte_len[*chunk];
+		if (!chunklen)
+			return nfserr_inval;
 		if (chunk + chunklen > sourceend)
 			return nfserr_inval;
-		if (!is_legal_iso_utf8_sequence(chunk, chunklen))
+		if (!is_legal_utf8_sequence(chunk, chunklen))
+			return nfserr_inval;
+		if (!is_allowed_utf8_char(chunk, chunklen))
 			return nfserr_inval;
+		if ( (chunklen==1) && (!*chunk) )
+			return nfserr_inval; /* Disallow embedded nulls */
 		chunk += chunklen;
 	}