Commit cfa59561 authored by Timothy B. Terriberry
Browse files

Minor UTF-8/UTF-16 cleanups.

- Reject 'not a character' values 0xFFFE and 0xFFFF.
- Remove some unnecessary string length checks.
parent 116b7034
...@@ -32,8 +32,9 @@ static char *utf16_to_utf8(const wchar_t *_src){ ...@@ -32,8 +32,9 @@ static char *utf16_to_utf8(const wchar_t *_src){
dst[di++]=(char)(0x80|c0&0x3F); dst[di++]=(char)(0x80|c0&0x3F);
continue; continue;
} }
else if(c0>=0xD800&&c0<0xDC00&&si+1<len){ else if(c0>=0xD800&&c0<0xDC00){
unsigned c1; unsigned c1;
/*This is safe, because c0 was not 0 and _src is NUL-terminated.*/
c1=_src[si+1]; c1=_src[si+1];
if(c1>=0xDC00&&c1<0xE000){ if(c1>=0xDC00&&c1<0xE000){
unsigned w; unsigned w;
...@@ -48,9 +49,9 @@ static char *utf16_to_utf8(const wchar_t *_src){ ...@@ -48,9 +49,9 @@ static char *utf16_to_utf8(const wchar_t *_src){
continue; continue;
} }
} }
/*Anything else is either a valid 3-byte sequence, or an invalid /*Anything else is either a valid 3-byte sequence, an invalid surrogate
surrogate pair. pair, or 'not a character'.
In the latter case, we just encode the value as a 3-byte In the latter two cases, we just encode the value as a 3-byte
sequence anyway (producing technically invalid UTF-8). sequence anyway (producing technically invalid UTF-8).
Later error handling will detect the problem, with a better Later error handling will detect the problem, with a better
chance of giving a useful error message.*/ chance of giving a useful error message.*/
......
...@@ -153,8 +153,9 @@ static wchar_t *op_utf8_to_utf16(const char *_src){ ...@@ -153,8 +153,9 @@ static wchar_t *op_utf8_to_utf16(const char *_src){
dst[di++]=(wchar_t)c0; dst[di++]=(wchar_t)c0;
continue; continue;
} }
else if(si+1<len){ else{
int c1; int c1;
/*This is safe, because c0 was not 0 and _src is NUL-terminated.*/
c1=(unsigned char)_src[si+1]; c1=(unsigned char)_src[si+1];
if((c1&0xC0)==0x80){ if((c1&0xC0)==0x80){
/*Found at least one continuation byte.*/ /*Found at least one continuation byte.*/
...@@ -169,8 +170,9 @@ static wchar_t *op_utf8_to_utf16(const char *_src){ ...@@ -169,8 +170,9 @@ static wchar_t *op_utf8_to_utf16(const char *_src){
continue; continue;
} }
} }
else if(si+2<len){ else{
int c2; int c2;
/*This is safe, because c1 was not 0 and _src is NUL-terminated.*/
c2=(unsigned char)_src[si+2]; c2=(unsigned char)_src[si+2];
if((c2&0xC0)==0x80){ if((c2&0xC0)==0x80){
/*Found at least two continuation bytes.*/ /*Found at least two continuation bytes.*/
...@@ -178,16 +180,19 @@ static wchar_t *op_utf8_to_utf16(const char *_src){ ...@@ -178,16 +180,19 @@ static wchar_t *op_utf8_to_utf16(const char *_src){
wchar_t w; wchar_t w;
/*Start byte says this is a 3-byte sequence.*/ /*Start byte says this is a 3-byte sequence.*/
w=(c0&0xF)<<12|(c1&0x3F)<<6|c2&0x3F; w=(c0&0xF)<<12|(c1&0x3F)<<6|c2&0x3F;
if(w>=0x800U&&(w<0xD800||w>=0xE000)){ if(w>=0x800U&&(w<0xD800||w>=0xE000)&&w<0xFFFE){
/*This is a 3-byte sequence that is not overlong and not a /*This is a 3-byte sequence that is not overlong, not a
UTF-16 surrogate pair value.*/ UTF-16 surrogate pair value, and not a 'not a character'
value.*/
dst[di++]=w; dst[di++]=w;
si+=2; si+=2;
continue; continue;
} }
} }
else if(si+3<len){ else{
int c3; int c3;
/*This is safe, because c2 was not 0 and _src is
NUL-terminated.*/
c3=(unsigned char)_src[si+3]; c3=(unsigned char)_src[si+3];
if((c3&0xC0)==0x80){ if((c3&0xC0)==0x80){
/*Found at least three continuation bytes.*/ /*Found at least three continuation bytes.*/
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment