Commit cfa59561 authored by Timothy B. Terriberry
Browse files

Minor UTF-8/UTF-16 cleanups.

- Reject 'not a character' values 0xFFFE and 0xFFFF.
- Remove some unnecessary string length checks.
parent 116b7034
......@@ -32,8 +32,9 @@ static char *utf16_to_utf8(const wchar_t *_src){
dst[di++]=(char)(0x80|c0&0x3F);
continue;
}
else if(c0>=0xD800&&c0<0xDC00&&si+1<len){
else if(c0>=0xD800&&c0<0xDC00){
unsigned c1;
/*This is safe, because c0 was not 0 and _src is NUL-terminated.*/
c1=_src[si+1];
if(c1>=0xDC00&&c1<0xE000){
unsigned w;
......@@ -48,9 +49,9 @@ static char *utf16_to_utf8(const wchar_t *_src){
continue;
}
}
/*Anything else is either a valid 3-byte sequence, or an invalid
surrogate pair.
In the latter case, we just encode the value as a 3-byte
/*Anything else is either a valid 3-byte sequence, an invalid surrogate
pair, or 'not a character'.
In the latter two cases, we just encode the value as a 3-byte
sequence anyway (producing technically invalid UTF-8).
Later error handling will detect the problem, with a better
chance of giving a useful error message.*/
......
......@@ -153,8 +153,9 @@ static wchar_t *op_utf8_to_utf16(const char *_src){
dst[di++]=(wchar_t)c0;
continue;
}
else if(si+1<len){
else{
int c1;
/*This is safe, because c0 was not 0 and _src is NUL-terminated.*/
c1=(unsigned char)_src[si+1];
if((c1&0xC0)==0x80){
/*Found at least one continuation byte.*/
......@@ -169,8 +170,9 @@ static wchar_t *op_utf8_to_utf16(const char *_src){
continue;
}
}
else if(si+2<len){
else{
int c2;
/*This is safe, because c1 was not 0 and _src is NUL-terminated.*/
c2=(unsigned char)_src[si+2];
if((c2&0xC0)==0x80){
/*Found at least two continuation bytes.*/
......@@ -178,16 +180,19 @@ static wchar_t *op_utf8_to_utf16(const char *_src){
wchar_t w;
/*Start byte says this is a 3-byte sequence.*/
w=(c0&0xF)<<12|(c1&0x3F)<<6|c2&0x3F;
if(w>=0x800U&&(w<0xD800||w>=0xE000)){
/*This is a 3-byte sequence that is not overlong and not a
UTF-16 surrogate pair value.*/
if(w>=0x800U&&(w<0xD800||w>=0xE000)&&w<0xFFFE){
/*This is a 3-byte sequence that is not overlong, not a
UTF-16 surrogate pair value, and not a 'not a character'
value.*/
dst[di++]=w;
si+=2;
continue;
}
}
else if(si+3<len){
else{
int c3;
/*This is safe, because c2 was not 0 and _src is
NUL-terminated.*/
c3=(unsigned char)_src[si+3];
if((c3&0xC0)==0x80){
/*Found at least three continuation bytes.*/
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment