Commit 1a260c78, authored by Andrew Morton, committed by Linus Torvalds

[PATCH] knfsd: Improve UTF8 checking.

From: NeilBrown <neilb@cse.unsw.edu.au>

From: Fred.  We don't do all the utf8 checking we could in the kernel, but we
do some simple checks.  Implement slightly stricter, and probably more
efficient, checking.
parent c02c0886
...@@ -58,93 +58,94 @@ ...@@ -58,93 +58,94 @@
#define NFSDDBG_FACILITY NFSDDBG_XDR #define NFSDDBG_FACILITY NFSDDBG_XDR
/* static const char utf8_byte_len[256] = {
* From Peter Astrand <peter@cendio.se>: The following routines check 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
* whether a filename supplied by the client is valid. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
*/ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
static const char trailing_bytes_for_utf8[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
}; };
static inline int static inline int
is_legal_iso_utf8_sequence(unsigned char *source, int length) is_legal_utf8_sequence(unsigned char *source, int length)
{ {
unsigned char a; unsigned char *ptr;
unsigned char *srcptr; unsigned char c;
srcptr = source + length;
switch (length) {
/* Everything else falls through when "1"... */
default:
/* Sequences with more than 6 bytes are invalid */
return 0;
/* if (length==1) return 1;
Byte 3-6 must be 80..BF
*/
case 6:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 5:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 4:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 3:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 2:
a = *--srcptr;
/* Upper limit */
if (a > 0xBF)
/* 2nd byte may never be > 0xBF */
return 0;
/* /* Check for overlong sequence, and check second byte */
Lower limits checks, to detect non-shortest forms. c = *(source + 1);
No fall-through in this inner switch.
*/
switch (*source) { switch (*source) {
case 0xE0: /* 3 bytes */ case 0xE0: /* 3 bytes */
if (a < 0xA0) return 0; if ( c < 0xA0 ) return 0;
break; break;
case 0xF0: /* 4 bytes */ case 0xF0: /* 4 bytes */
if (a < 0x90) return 0; if ( c < 0x90 ) return 0;
break; break;
case 0xF8: /* 5 bytes */ case 0xF8: /* 5 bytes */
if (a < 0xC8) return 0; if ( c < 0xC8 ) return 0;
break; break;
case 0xFC: /* 6 bytes */ case 0xFC: /* 6 bytes */
if (a < 0x84) return 0; if ( c < 0x84 ) return 0;
break; break;
default: default:
/* In all cases, 2nd byte must be >= 0x80 (because leading if ( (c & 0xC0) != 0x80) return 0;
10...) */
if (a < 0x80) return 0;
} }
case 1: /* Check that trailing bytes look like 10xxxxxx */
/* Invalid ranges */ for (ptr = source++ + length - 1; ptr>source; ptr--)
if (*source >= 0x80 && *source < 0xC2) if ( ((*ptr) & 0xC0) != 0x80 ) return 0;
/* Multibyte char with value < 0xC2, non-shortest */ return 1;
return 0; }
if (*source > 0xFD)
/* Leading byte starting with 11111110 is illegal */ /* This does some screening on disallowed unicode characters. It is NOT
return 0; * comprehensive.
if (!*source) */
static int
is_allowed_utf8_char(unsigned char *source, int length)
{
/* We assume length and source point to a valid utf8 sequence */
unsigned char c;
/* Disallow F0000 and up (in utf8, F3B08080) */
if (*source > 0xF3 ) return 0;
c = *(source + 1);
switch (*source) {
case 0xF3:
if (c >= 0xB0) return 0;
break;
/* Disallow D800-F8FF (in utf8, EDA080-EFA3BF */
case 0xED:
if (c >= 0xA0) return 0;
break;
case 0xEE:
return 0; return 0;
break;
case 0xEF:
if (c <= 0xA3) return 0;
/* Disallow FFF9-FFFF (EFBFB9-EFBFBF) */
if (c==0xBF)
/* Don't need to check <=0xBF, since valid utf8 */
if ( *(source+2) >= 0xB9) return 0;
break;
} }
return 1; return 1;
} }
/* This routine should really check to see that the proper stringprep
* mappings have been applied. Instead, we do a simple screen of some
* of the more obvious illegal values by calling is_allowed_utf8_char.
* This will allow many illegal strings through, but if a client behaves,
* it will get full functionality. The other option (apart from full
* stringprep checking) is to limit everything to an easily handled subset,
* such as 7-bit ascii.
*
* Note - currently calling routines ignore return value except as boolean.
*/
static int static int
check_utf8(char *str, int len) check_utf8(char *str, int len)
{ {
...@@ -155,11 +156,17 @@ check_utf8(char *str, int len) ...@@ -155,11 +156,17 @@ check_utf8(char *str, int len)
sourceend = str + len; sourceend = str + len;
while (chunk < sourceend) { while (chunk < sourceend) {
chunklen = trailing_bytes_for_utf8[*chunk]+1; chunklen = utf8_byte_len[*chunk];
if (!chunklen)
return nfserr_inval;
if (chunk + chunklen > sourceend) if (chunk + chunklen > sourceend)
return nfserr_inval; return nfserr_inval;
if (!is_legal_iso_utf8_sequence(chunk, chunklen)) if (!is_legal_utf8_sequence(chunk, chunklen))
return nfserr_inval;
if (!is_allowed_utf8_char(chunk, chunklen))
return nfserr_inval; return nfserr_inval;
if ( (chunklen==1) && (!*chunk) )
return nfserr_inval; /* Disallow embedded nulls */
chunk += chunklen; chunk += chunklen;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment