From f65dab6f653a94eb3ae9a1a9331943569d44e30e Mon Sep 17 00:00:00 2001
From: "Timothy B. Terriberry" <tterribe@xiph.org>
Date: Tue, 6 Aug 2013 11:56:30 -0700
Subject: [PATCH] Add UTF-8 filename support to Windows.

As requested here:
 http://www.hydrogenaudio.org/forums/index.php?showtopic=101817
---
Review note (ignored by git-am): this copy of the patch carries two one-line
 corrections to op_utf8_to_utf16() in src/stream.c.
 - 2-byte sequences: the start byte is now masked before it is shifted,
    (c0&0x1F)<<6|c1&0x3F, because '<<' binds tighter than '&' and the old
    expression c0&0x1F<<6 evaluated as c0&(0x1F<<6).
 - 4-byte sequences: the last continuation byte is now ORed into the code
    point; the old expression ANDed it with (c2&0x3F)<<6, which always
    cleared the low 12 bits.
 No lines were added or removed, so the hunk headers and the diffstat are
  unchanged; the 'index' line of src/stream.c still names the original blobs.

 Makefile.am                 |  16 +++--
 examples/opusfile_example.c |  19 ++----
 examples/seeking_example.c  |  18 ++---
 examples/win32utf8.c        | 110 ++++++++++++++++++++++++++++++
 examples/win32utf8.h        |   9 +++
 include/opusfile.h          |   8 +++
 src/stream.c                | 131 ++++++++++++++++++++++++++++++++++++
 7 files changed, 279 insertions(+), 32 deletions(-)
 create mode 100644 examples/win32utf8.c
 create mode 100644 examples/win32utf8.h

diff --git a/Makefile.am b/Makefile.am
index e13c7cf..055b5f2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -21,18 +21,22 @@ libopusurl_la_LIBADD = libopusfile.la $(URL_DEPS_LIBS)
 libopusurl_la_LDFLAGS = -no-undefined \
  -version-info @OP_LT_CURRENT@:@OP_LT_REVISION@:@OP_LT_AGE@
 
-if OP_ENABLE_HTTP
+noinst_PROGRAMS = examples/opusfile_example examples/seeking_example
+
+examples_opusfile_example_SOURCES = examples/opusfile_example.c
+examples_seeking_example_SOURCES = examples/seeking_example.c
+examples_opusfile_example_LDADD = libopusurl.la libopusfile.la
+examples_seeking_example_LDADD = libopusurl.la libopusfile.la
+
 if OP_WIN32
+if OP_ENABLE_HTTP
 libopusurl_la_SOURCES += src/wincerts.c
 libopusurl_la_LIBADD += -lws2_32 -lcrypt32
 endif
+examples_opusfile_example_SOURCES += examples/win32utf8.c
+examples_seeking_example_SOURCES += examples/win32utf8.c
 endif
 
-noinst_PROGRAMS = examples/opusfile_example examples/seeking_example
-
-examples_opusfile_example_LDADD = libopusurl.la libopusfile.la
-examples_seeking_example_LDADD = libopusurl.la libopusfile.la
-
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = opusfile.pc opusurl.pc
 
diff --git a/examples/opusfile_example.c b/examples/opusfile_example.c
index f6afdba..3f6fcb4 100644
--- a/examples/opusfile_example.c
+++ b/examples/opusfile_example.c
@@ -21,12 +21,12 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
+#include <opusfile.h>
 #if defined(_WIN32)
-/*We need the following two to set stdin/stdout to binary.*/
-# include <io.h>
-# include <fcntl.h>
+# include "win32utf8.h"
+# undef fileno
+# define fileno _fileno
 #endif
-#include <opusfile.h>
 
 static void print_duration(FILE *_fp,ogg_int64_t _nsamples,int _frac){
   ogg_int64_t seconds;
@@ -138,15 +138,7 @@ int main(int _argc,const char **_argv){
   int is_ssl;
   int output_seekable;
 #if defined(_WIN32)
-# undef fileno
-# define fileno _fileno
-  /*We need to set stdin/stdout to binary mode. Damn windows.*/
-  /*Beware the evil ifdef. We avoid these where we can, but this one we
-     cannot.
-    Don't add any more.
-    You'll probably go to hell if you do.*/
-  _setmode(fileno(stdin),_O_BINARY);
-  _setmode(fileno(stdout),_O_BINARY);
+  win32_utf8_setup(&_argc,&_argv);
 #endif
   if(_argc!=2){
     fprintf(stderr,"Usage: %s <file.opus>\n",_argv[0]);
@@ -289,6 +281,7 @@ int main(int _argc,const char **_argv){
           print_size(stderr,bitrate,1," ");
           fprintf(stderr,"bps) \r");
           pcm_print_offset=pcm_offset;
+          fflush(stderr);
         }
         next_pcm_offset=op_pcm_tell(of);
         if(pcm_offset+ret!=next_pcm_offset){
diff --git a/examples/seeking_example.c b/examples/seeking_example.c
index 65f47ea..43d6848 100644
--- a/examples/seeking_example.c
+++ b/examples/seeking_example.c
@@ -22,12 +22,12 @@
 #include <errno.h>
 #include <math.h>
 #include <string.h>
+#include <opusfile.h>
 #if defined(_WIN32)
-/*We need the following two to set stdin/stdout to binary.*/
-# include <io.h>
-# include <fcntl.h>
+# include "win32utf8.h"
+# undef fileno
+# define fileno _fileno
 #endif
-#include <opusfile.h>
 
 /*Use shorts, they're smaller.*/
 #if !defined(OP_FIXED_POINT)
@@ -261,15 +261,7 @@ int main(int _argc,const char **_argv){
   OggOpusFile *of;
   void *fp;
 #if defined(_WIN32)
-# undef fileno
-# define fileno _fileno
-  /*We need to set stdin/stdout to binary mode. Damn windows.*/
-  /*Beware the evil ifdef. We avoid these where we can, but this one we
-     cannot.
-    Don't add any more.
-    You'll probably go to hell if you do.*/
-  _setmode(fileno(stdin),_O_BINARY);
-  _setmode(fileno(stdout),_O_BINARY);
+  win32_utf8_setup(&_argc,&_argv);
 #endif
   if(_argc!=2){
     fprintf(stderr,"Usage: %s <file.opus>\n",_argv[0]);
diff --git a/examples/win32utf8.c b/examples/win32utf8.c
new file mode 100644
index 0000000..6c0cbc5
--- /dev/null
+++ b/examples/win32utf8.c
@@ -0,0 +1,110 @@
+#if defined(_WIN32)
+# include <stdio.h>
+# include <stdlib.h>
+# include <wchar.h>
+/*We need the following two to set stdin/stdout to binary.*/
+# include <io.h>
+# include <fcntl.h>
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+# include "win32utf8.h"
+
+static char *utf16_to_utf8(const wchar_t *_src){
+  char *dst;
+  size_t len;
+  size_t si;
+  size_t di;
+  len=wcslen(_src);
+  dst=(char *)malloc(sizeof(*dst)*(3*len+1));
+  if(dst==NULL)return dst;
+  for(di=si=0;si<len;si++){
+    unsigned c0;
+    c0=_src[si];
+    if(c0<0x80){
+      /*Can be represented by a 1-byte sequence.*/
+      dst[di++]=(char)c0;
+      continue;
+    }
+    else if(c0<0x800){
+      /*Can be represented by a 2-byte sequence.*/
+      dst[di++]=(char)(0xC0|c0>>6);
+      dst[di++]=(char)(0x80|c0&0x3F);
+      continue;
+    }
+    else if(c0>=0xD800&&c0<0xDC00&&si+1<len){
+      unsigned c1;
+      c1=_src[si+1];
+      if(c1>=0xDC00&&c1<0xE000){
+        unsigned w;
+        /*Surrogate pair.*/
+        w=((c0&0x3FF)<<10|c1&0x3FF)+0x10000;
+        /*Can be represented by a 4-byte sequence.*/
+        dst[di++]=(char)(0xF0|w>>18);
+        dst[di++]=(char)(0x80|w>>12&0x3F);
+        dst[di++]=(char)(0x80|w>>6&0x3F);
+        dst[di++]=(char)(0x80|w&0x3F);
+        si++;
+        continue;
+      }
+    }
+    /*Anything else is either a valid 3-byte sequence, or an invalid
+       surrogate pair.
+      In the latter case, we just encode the value as a 3-byte
+       sequence anyway (producing technically invalid UTF-8).
+      Later error handling will detect the problem, with a better
+       chance of giving a useful error message.*/
+    dst[di++]=(char)(0xE0|c0>>12);
+    dst[di++]=(char)(0x80|c0>>6&0x3F);
+    dst[di++]=(char)(0x80|c0&0x3F);
+  }
+  dst[di++]='\0';
+  return dst;
+}
+
+typedef LPWSTR *(APIENTRY *command_line_to_argv_w_func)(LPCWSTR cmd_line,
+ int *num_args);
+
+/*Make a best-effort attempt to support UTF-8 on Windows.*/
+void win32_utf8_setup(int *_argc,const char ***_argv){
+  HMODULE hlib;
+  /*We need to set stdin/stdout to binary mode.
+    This is unrelated to UTF-8 support, but it's platform specific and we need
+     to do it in the same places.*/
+  _setmode(_fileno(stdin),_O_BINARY);
+  _setmode(_fileno(stdout),_O_BINARY);
+  hlib=LoadLibraryA("shell32.dll");
+  if(hlib!=NULL){
+    command_line_to_argv_w_func command_line_to_argv_w;
+    /*This function is only available on Windows 2000 or later.*/
+    command_line_to_argv_w=(command_line_to_argv_w_func)GetProcAddress(hlib,
+     "CommandLineToArgvW");
+    if(command_line_to_argv_w!=NULL){
+      wchar_t **argvw;
+      int argc;
+      argvw=(*command_line_to_argv_w)(GetCommandLineW(),&argc);
+      if(argvw!=NULL){
+        int ai;
+        /*Really, I don't see why argc would ever differ from *_argc, but let's
+           be paranoid.*/
+        if(argc>*_argc)argc=*_argc;
+        for(ai=0;ai<argc;ai++){
+          char *argv;
+          argv=utf16_to_utf8(argvw[ai]);
+          if(argv!=NULL)(*_argv)[ai]=argv;
+        }
+        *_argc=argc;
+        LocalFree(argvw);
+      }
+    }
+    FreeLibrary(hlib);
+  }
+# if defined(CP_UTF8)
+  /*This does not work correctly in all environments (it breaks output in
+     mingw32 for me), and requires a Unicode font (e.g., when using the default
+     Raster font, even characters that are available in the font's codepage
+     won't display properly).*/
+  /*SetConsoleOutputCP(CP_UTF8);*/
+# endif
+}
+#endif
diff --git a/examples/win32utf8.h b/examples/win32utf8.h
new file mode 100644
index 0000000..7f99171
--- /dev/null
+++ b/examples/win32utf8.h
@@ -0,0 +1,9 @@
+#if !defined(_win32utf8_H)
+# define _win32utf8_H (1)
+# if defined(_WIN32)
+
+/*Make a best-effort attempt to support UTF-8 on Windows.*/
+void win32_utf8_setup(int *_argc,const char ***_argv);
+
+# endif
+#endif
diff --git a/include/opusfile.h b/include/opusfile.h
index 3478e55..6e1e214 100644
--- a/include/opusfile.h
+++ b/include/opusfile.h
@@ -630,6 +630,10 @@ struct OpusFileCallbacks{
                     If there is an error opening the file, nothing will be
                      filled in here.
    \param      _path The path to the file to open.
+                     On Windows, this string must be UTF-8 (to allow access to
+                      files whose names cannot be represented in the current
+                      MBCS code page).
+                     All other systems use the native character encoding.
    \param      _mode The mode to open the file in.
    \return A stream handle to use with the callbacks, or <code>NULL</code> on
             error.*/
@@ -663,6 +667,10 @@ OP_WARN_UNUSED_RESULT void *op_fdopen(OpusFileCallbacks *_cb,
                       If there is an error opening the file, nothing will be
                        filled in here.
    \param      _path   The path to the file to open.
+                       On Windows, this string must be UTF-8 (to allow access
+                        to files whose names cannot be represented in the
+                        current MBCS code page).
+                       All other systems use the native character encoding.
    \param      _mode   The mode to open the file in.
    \param      _stream A stream previously returned by op_fopen(), op_fdopen(),
                         or op_freopen().
diff --git a/src/stream.c b/src/stream.c
index 1f1ba2d..1004b27 100644
--- a/src/stream.c
+++ b/src/stream.c
@@ -103,9 +103,124 @@ static const OpusFileCallbacks OP_FILE_CALLBACKS={
   (op_close_func)fclose
 };
 
+#if defined(_WIN32)
+# include <stddef.h>
+# include <errno.h>
+
+/*Windows doesn't accept UTF-8 by default, and we don't have a wchar_t API,
+   so if we just pass the path to fopen(), then there'd be no way for a user
+   of our API to open a Unicode filename.
+  Instead, we translate from UTF-8 to UTF-16 and use Windows' wchar_t API.
+  This makes this API more consistent with platforms where the character set
+   used by fopen is the same as used on disk, which is generally UTF-8, and
+   with our metadata API, which always uses UTF-8.*/
+static wchar_t *op_utf8_to_utf16(const char *_src){
+  wchar_t *dst;
+  size_t len;
+  len=strlen(_src);
+  /*Worst-case output is 1 wide character per 1 input character.*/
+  dst=(wchar_t *)malloc(sizeof(*dst)*(len+1));
+  if(dst!=NULL){
+    size_t si;
+    size_t di;
+    for(di=si=0;si<len;si++){
+      int c0;
+      c0=(unsigned char)_src[si];
+      if(!(c0&0x80)){
+        /*Start byte says this is a 1-byte sequence.*/
+        dst[di++]=(wchar_t)c0;
+        continue;
+      }
+      else if(si+1<len){
+        int c1;
+        c1=(unsigned char)_src[si+1];
+        if((c1&0xC0)==0x80){
+          /*Found at least one continuation byte.*/
+          if((c0&0xE0)==0xC0){
+            wchar_t w;
+            /*Start byte says this is a 2-byte sequence.*/
+            w=(c0&0x1F)<<6|c1&0x3F;
+            if(w>=0x80U){
+              /*This is a 2-byte sequence that is not overlong.*/
+              dst[di++]=w;
+              si++;
+              continue;
+            }
+          }
+          else if(si+2<len){
+            int c2;
+            c2=(unsigned char)_src[si+2];
+            if((c2&0xC0)==0x80){
+              /*Found at least two continuation bytes.*/
+              if((c0&0xF0)==0xE0){
+                wchar_t w;
+                /*Start byte says this is a 3-byte sequence.*/
+                w=(c0&0xF)<<12|(c1&0x3F)<<6|c2&0x3F;
+                if(w>=0x800U&&(w<0xD800||w>=0xE000)){
+                  /*This is a 3-byte sequence that is not overlong and not a
+                     UTF-16 surrogate pair value.*/
+                  dst[di++]=w;
+                  si+=2;
+                  continue;
+                }
+              }
+              else if(si+3<len){
+                int c3;
+                c3=(unsigned char)_src[si+3];
+                if((c3&0xC0)==0x80){
+                  /*Found at least three continuation bytes.*/
+                  if((c0&0xF8)==0xF0){
+                    opus_uint32 w;
+                    /*Start byte says this is a 4-byte sequence.*/
+                    w=(c0&7)<<18|(c1&0x3F)<<12|(c2&0x3F)<<6|(c3&0x3F);
+                    if(w>=0x10000U&&w<0x110000U){
+                      /*This is a 4-byte sequence that is not overlong and not
+                         greater than the largest valid Unicode code point.
+                        Convert it to a surrogate pair.*/
+                      w-=0x10000;
+                      dst[di++]=(wchar_t)(0xD800+(w>>10));
+                      dst[di++]=(wchar_t)(0xDC00+(w&0x3FF));
+                      si+=3;
+                      continue;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      /*If we got here, we encountered an illegal UTF-8 sequence.*/
+      free(dst);
+      return NULL;
+    }
+    OP_ASSERT(di<=len);
+    dst[di]='\0';
+  }
+  return dst;
+}
+
+#endif
+
 void *op_fopen(OpusFileCallbacks *_cb,const char *_path,const char *_mode){
   FILE *fp;
+#if !defined(_WIN32)
   fp=fopen(_path,_mode);
+#else
+  fp=NULL;
+  if(_path==NULL||_mode==NULL)errno=EINVAL;
+  else{
+    wchar_t *wpath;
+    wchar_t *wmode;
+    wpath=op_utf8_to_utf16(_path);
+    wmode=op_utf8_to_utf16(_mode);
+    if(wmode==NULL)errno=EINVAL;
+    else if(wpath==NULL)errno=ENOENT;
+    else fp=_wfopen(wpath,wmode);
+    free(wmode);
+    free(wpath);
+  }
+#endif
   if(fp!=NULL)*_cb=*&OP_FILE_CALLBACKS;
   return fp;
 }
@@ -120,7 +235,23 @@ void *op_fdopen(OpusFileCallbacks *_cb,int _fd,const char *_mode){
 void *op_freopen(OpusFileCallbacks *_cb,const char *_path,const char *_mode,
  void *_stream){
   FILE *fp;
+#if !defined(_WIN32)
   fp=freopen(_path,_mode,(FILE *)_stream);
+#else
+  fp=NULL;
+  if(_path==NULL||_mode==NULL)errno=EINVAL;
+  else{
+    wchar_t *wpath;
+    wchar_t *wmode;
+    wpath=op_utf8_to_utf16(_path);
+    wmode=op_utf8_to_utf16(_mode);
+    if(wmode==NULL)errno=EINVAL;
+    else if(wpath==NULL)errno=ENOENT;
+    else fp=_wfreopen(wpath,wmode,(FILE *)_stream);
+    free(wmode);
+    free(wpath);
+  }
+#endif
   if(fp!=NULL)*_cb=*&OP_FILE_CALLBACKS;
   return fp;
 }
-- 
GitLab