Commit 86fb1a3e authored by Michael Smith

Updates and bugfixes, plus extra tests, from Edmund Evans.

svn path=/trunk/vorbis-tools/; revision=2185
parent f41242bc
......@@ -88,7 +88,7 @@ int utf8_mbtowc(int *pwc, const char *s, size_t n)
else if (c < 0xc2)
return -1;
else if (c < 0xe0) {
if (n >= 2) {
if (n >= 2 && (s[1] & 0xc0) == 0x80) {
if (pwc)
*pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);
return 2;
......@@ -176,7 +176,7 @@ int utf8_wctomb(char *s, int wc1)
*/
struct charset {
int min, max;
int max;
int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
int (*wctomb)(void *table, char *s, int wc);
void *map;
......@@ -192,11 +192,6 @@ int charset_wctomb(struct charset *charset, char *s, int wc)
return (*charset->wctomb)(charset->map, s, wc);
}
/*
 * Accessor: returns the `min` field of the given charset descriptor.
 * The pointer is dereferenced without a check, so it must be non-null.
 * NOTE(review): `min` is presumably the minimum number of bytes per
 * character in the encoding (charset_convert divides the input length
 * by it when sizing its buffer) -- confirm.
 */
int charset_min(struct charset *charset)
{
return charset->min;
}
int charset_max(struct charset *charset)
{
return charset->max;
......@@ -398,21 +393,21 @@ int wctomb_8bit(void *map1, char *s, int wc1)
*/
struct charset charset_utf8 = {
1, 6,
6,
&mbtowc_utf8,
&wctomb_utf8,
0
};
struct charset charset_iso1 = {
1, 1,
1,
&mbtowc_iso1,
&wctomb_iso1,
0
};
struct charset charset_ascii = {
1, 1,
1,
&mbtowc_ascii,
&wctomb_ascii,
0
......@@ -449,7 +444,6 @@ struct charset *charset_find(const char *code)
maps[i].charset = 0;
}
else {
maps[i].charset->min = 1;
maps[i].charset->max = 1;
maps[i].charset->mbtowc = &mbtowc_8bit;
maps[i].charset->wctomb = &wctomb_8bit;
......@@ -488,7 +482,7 @@ int charset_convert(const char *fromcode, const char *tocode,
if (!charset1 || !charset2 )
return -1;
tobuf = (char *)malloc((fromlen / charset1->min) * charset2->max + 1);
tobuf = (char *)malloc(fromlen * charset2->max + 1);
if (!tobuf)
return -2;
......
/*
* Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <stdlib.h>
/*
* These functions are like the C library's mbtowc() and wctomb(),
* but instead of depending on the locale they always work in UTF-8,
* and they use int instead of wchar_t.
*/
/*
 * Decode one UTF-8 character from at most n bytes starting at s.
 * On success the number of bytes consumed is returned and, if pwc is
 * non-null, the decoded value is stored in *pwc. Invalid input (for
 * example an invalid lead byte or a bad continuation byte) yields -1.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n);
/*
 * Encode the character wc as UTF-8 into the buffer s.
 * NOTE(review): the return convention presumably mirrors wctomb()
 * (bytes written, or -1 if wc cannot be encoded) -- confirm against
 * the implementation.
 */
int utf8_wctomb(char *s, int wc);
/*
* This is an object-oriented version of mbtowc() and wctomb().
* The caller first uses charset_find() to get a pointer to struct
* charset, then uses the mbtowc() and wctomb() methods on it.
* The function charset_max() gives the maximum length of a
* multibyte character in that encoding.
* This API is only appropriate for stateless encodings like UTF-8
* or ISO-8859-3, but I have no intention of implementing anything
* other than UTF-8 and 8-bit encodings.
*
* MINOR BUG: If memory allocation fails, charset_find() may return 0,
* and there is no way to distinguish this case from an unknown encoding.
*/
/* Opaque handle: the structure is only forward-declared in this header. */
struct charset;
/*
 * Look up the encoding named by code. May return 0, which can mean
 * either an unknown encoding or an out-of-memory condition.
 */
struct charset *charset_find(const char *code);
/* mbtowc()-like decoder method: decodes one character from at most n bytes at s. */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n);
/* wctomb()-like encoder method: encodes the character wc into the buffer s. */
int charset_wctomb(struct charset *charset, char *s, int wc);
/* Maximum length, in bytes, of a multibyte character in this encoding. */
int charset_max(struct charset *charset);
/*
* Function to convert a buffer from one encoding to another.
* Invalid bytes are replaced by '#', and characters that are
* not available in the target encoding are replaced by '?'.
* Each of TO and TOLEN may be zero if the result is not wanted.
* The input or output may contain null bytes, but the output
* buffer is also null-terminated, so it is all right to
* use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
*
* Return value:
*
* -2 : memory allocation failed
* -1 : unknown encoding
* 0 : data was converted exactly
* 1 : valid data was converted approximately (using '?')
* 2 : input was invalid (but still converted, using '#')
*/
/*
 * Convert fromlen bytes at from, encoded as fromcode, into tocode.
 * NOTE(review): the implementation allocates its working buffer with
 * malloc(); presumably that buffer is what is handed back through *to,
 * making the caller responsible for free()ing it -- confirm.
 */
int charset_convert(const char *fromcode, const char *tocode,
const char *from, size_t fromlen,
char **to, size_t *tolen);
......@@ -30,13 +30,22 @@ void test_any(struct charset *charset)
/* Decoder */
assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0);
assert(charset_mbtowc(charset, 0, 0, 0) == 0);
assert(charset_mbtowc(charset, 0, 0, 1) == 0);
assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0);
assert(charset_mbtowc(charset, &wc, "x", 0) == 0);
assert(charset_mbtowc(charset, &wc, "x", 1) == 1 && wc == 'x');
assert(charset_mbtowc(charset, &wc, "x", 2) == 1 && wc == 'x');
assert(charset_mbtowc(charset, 0, "a", 0) == 0);
assert(charset_mbtowc(charset, 0, "", 1) == 0);
assert(charset_mbtowc(charset, 0, "b", 1) == 1);
assert(charset_mbtowc(charset, 0, "", 2) == 0);
assert(charset_mbtowc(charset, 0, "c", 2) == 1);
wc = 'x';
assert(charset_mbtowc(charset, &wc, "a", 0) == 0 && wc == 'x');
assert(charset_mbtowc(charset, &wc, "", 1) == 0 && wc == 0);
assert(charset_mbtowc(charset, &wc, "b", 1) == 1 && wc == 'b');
assert(charset_mbtowc(charset, &wc, "", 2) == 0 && wc == 0);
assert(charset_mbtowc(charset, &wc, "c", 2) == 1 && wc == 'c');
/* Encoder */
......@@ -81,6 +90,30 @@ void test_utf8()
assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\277", 9) == 6 &&
wc == 0x7fffffff);
assert(charset_mbtowc(charset, &wc, "\302\000", 2) == -1);
assert(charset_mbtowc(charset, &wc, "\302\300", 2) == -1);
assert(charset_mbtowc(charset, &wc, "\340\040\200", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\340\340\200", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\340\240\000", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\340\240\300", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\360\020\200\200", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\360\320\200\200", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\360\220\000\200", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\360\220\300\200", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\360\220\200\000", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\360\220\200\300", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\375\077\277\277\277\277", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\375\377\277\277\277\277", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\375\277\077\277\277\277", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\375\277\377\277\277\277", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\375\277\277\277\077\277", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\375\277\277\277\377\277", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\077", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\377", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\376\277\277\277\277\277", 9) == -1);
assert(charset_mbtowc(charset, &wc, "\377\277\277\277\277\277", 9) == -1);
/* Encoder */
strcpy(s, ".......");
assert(charset_wctomb(charset, s, 1 << 31) == -1 &&
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment