From f65dab6f653a94eb3ae9a1a9331943569d44e30e Mon Sep 17 00:00:00 2001
From: "Timothy B. Terriberry" <>
Date: Tue, 6 Aug 2013 11:56:30 -0700
Subject: [PATCH] Add UTF-8 filename support to Windows.

As requested here:
---
 Makefile.am                 |  16 +++--
 examples/opusfile_example.c |  19 ++----
 examples/seeking_example.c  |  18 ++---
 examples/win32utf8.c        | 110 ++++++++++++++++++++++++++++++
 examples/win32utf8.h        |   9 +++
 include/opusfile.h          |   8 +++
 src/stream.c                | 131 ++++++++++++++++++++++++++++++++++++
 7 files changed, 279 insertions(+), 32 deletions(-)
 create mode 100644 examples/win32utf8.c
 create mode 100644 examples/win32utf8.h

diff --git a/Makefile.am b/Makefile.am
index e13c7cf..055b5f2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -21,18 +21,22 @@ libopusurl_la_LIBADD = $(URL_DEPS_LIBS)
 libopusurl_la_LDFLAGS = -no-undefined \
  -version-info @OP_LT_CURRENT@:@OP_LT_REVISION@:@OP_LT_AGE@
+noinst_PROGRAMS = examples/opusfile_example examples/seeking_example
+examples_opusfile_example_SOURCES = examples/opusfile_example.c
+examples_seeking_example_SOURCES = examples/seeking_example.c
+examples_opusfile_example_LDADD =
+examples_seeking_example_LDADD =
 if OP_WIN32
 libopusurl_la_SOURCES += src/wincerts.c
 libopusurl_la_LIBADD += -lws2_32 -lcrypt32
+examples_opusfile_example_SOURCES += examples/win32utf8.c
+examples_seeking_example_SOURCES += examples/win32utf8.c
-noinst_PROGRAMS = examples/opusfile_example examples/seeking_example
-examples_opusfile_example_LDADD =
-examples_seeking_example_LDADD =
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = opusfile.pc opusurl.pc
diff --git a/examples/opusfile_example.c b/examples/opusfile_example.c
index f6afdba..3f6fcb4 100644
--- a/examples/opusfile_example.c
+++ b/examples/opusfile_example.c
@@ -21,12 +21,12 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
+#include <opusfile.h>
 #if defined(_WIN32)
-/*We need the following two to set stdin/stdout to binary.*/
-# include <io.h>
-# include <fcntl.h>
+# include "win32utf8.h"
+# undef fileno
+# define fileno _fileno
-#include <opusfile.h>
 static void print_duration(FILE *_fp,ogg_int64_t _nsamples,int _frac){
   ogg_int64_t seconds;
@@ -138,15 +138,7 @@ int main(int _argc,const char **_argv){
   int           is_ssl;
   int           output_seekable;
 #if defined(_WIN32)
-# undef fileno
-# define fileno _fileno
-  /*We need to set stdin/stdout to binary mode. Damn windows.*/
-  /*Beware the evil ifdef. We avoid these where we can, but this one we
-     cannot.
-    Don't add any more.
-    You'll probably go to hell if you do.*/
-  _setmode(fileno(stdin),_O_BINARY);
-  _setmode(fileno(stdout),_O_BINARY);
+  win32_utf8_setup(&_argc,&_argv);
     fprintf(stderr,"Usage: %s <file.opus>\n",_argv[0]);
@@ -289,6 +281,7 @@ int main(int _argc,const char **_argv){
         print_size(stderr,bitrate,1," ");
         fprintf(stderr,"bps)                    \r");
+        fflush(stderr);
diff --git a/examples/seeking_example.c b/examples/seeking_example.c
index 65f47ea..43d6848 100644
--- a/examples/seeking_example.c
+++ b/examples/seeking_example.c
@@ -22,12 +22,12 @@
 #include <errno.h>
 #include <math.h>
 #include <string.h>
+#include <opusfile.h>
 #if defined(_WIN32)
-/*We need the following two to set stdin/stdout to binary.*/
-# include <io.h>
-# include <fcntl.h>
+# include "win32utf8.h"
+# undef fileno
+# define fileno _fileno
-#include <opusfile.h>
 /*Use shorts, they're smaller.*/
 #if !defined(OP_FIXED_POINT)
@@ -261,15 +261,7 @@ int main(int _argc,const char **_argv){
   OggOpusFile       *of;
   void              *fp;
 #if defined(_WIN32)
-# undef fileno
-# define fileno _fileno
-  /*We need to set stdin/stdout to binary mode. Damn windows.*/
-  /*Beware the evil ifdef. We avoid these where we can, but this one we
-     cannot.
-    Don't add any more.
-    You'll probably go to hell if you do.*/
-  _setmode(fileno(stdin),_O_BINARY);
-  _setmode(fileno(stdout),_O_BINARY);
+  win32_utf8_setup(&_argc,&_argv);
     fprintf(stderr,"Usage: %s <file.opus>\n",_argv[0]);
diff --git a/examples/win32utf8.c b/examples/win32utf8.c
new file mode 100644
index 0000000..6c0cbc5
--- /dev/null
+++ b/examples/win32utf8.c
@@ -0,0 +1,110 @@
+#if defined(_WIN32)
+# include <stdio.h>
+# include <stdlib.h>
+# include <wchar.h>
+/*We need the following two to set stdin/stdout to binary.*/
+# include <io.h>
+# include <fcntl.h>
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+# include "win32utf8.h"
+static char *utf16_to_utf8(const wchar_t *_src){/*Converts the NUL-terminated
+   UTF-16 string _src into a newly malloc()ed, NUL-terminated UTF-8 string.
+  Returns NULL if malloc() fails.
+  The buffer holds 3 bytes per input code unit plus the terminator, which is
+   always enough: a lone code unit needs at most 3 bytes, and a surrogate
+   pair (2 code units) needs 4.*/
+  char   *dst;
+  size_t  len;
+  size_t  si;
+  size_t  di;
+  len=wcslen(_src);
+  dst=(char *)malloc(sizeof(*dst)*(3*len+1));
+  if(dst==NULL)return dst;
+  for(di=si=0;si<len;si++){
+    unsigned c0;
+    c0=_src[si];
+    if(c0<0x80){
+      /*Can be represented by a 1-byte sequence.*/
+      dst[di++]=(char)c0;
+      continue;
+    }
+    else if(c0<0x800){
+      /*Can be represented by a 2-byte sequence.*/
+      dst[di++]=(char)(0xC0|c0>>6);
+      dst[di++]=(char)(0x80|c0&0x3F);
+      continue;
+    }
+    else if(c0>=0xD800&&c0<0xDC00&&si+1<len){
+      unsigned c1;
+      c1=_src[si+1];
+      if(c1>=0xDC00&&c1<0xE000){
+        unsigned w;
+        /*Surrogate pair: c0 is a high surrogate and c1 is a low surrogate.
+          Combine their 10-bit payloads and add 0x10000 to recover the code
+           point.*/
+        w=((c0&0x3FF)<<10|c1&0x3FF)+0x10000;
+        /*Can be represented by a 4-byte sequence.
+          The si++ below consumes the low surrogate; the loop increment
+           consumes the high one.*/
+        dst[di++]=(char)(0xF0|w>>18);
+        dst[di++]=(char)(0x80|w>>12&0x3F);
+        dst[di++]=(char)(0x80|w>>6&0x3F);
+        dst[di++]=(char)(0x80|w&0x3F);
+        si++;
+        continue;
+      }
+    }
+    /*Anything else is either a valid 3-byte sequence, or an invalid
+       surrogate pair.
+      In the latter case, we just encode the value as a 3-byte
+       sequence anyway (producing technically invalid UTF-8).
+      Later error handling will detect the problem, with a better
+       chance of giving a useful error message.*/
+    dst[di++]=(char)(0xE0|c0>>12);
+    dst[di++]=(char)(0x80|c0>>6&0x3F);
+    dst[di++]=(char)(0x80|c0&0x3F);
+  }
+  dst[di++]='\0';
+  return dst;
+typedef LPWSTR *(APIENTRY *command_line_to_argv_w_func)(LPCWSTR cmd_line,
+ int *num_args);
+/*Make a best-effort attempt to support UTF-8 on Windows.
+  Puts stdin and stdout into binary mode, then tries to replace the entries of
+   *_argv with UTF-8 conversions of the process's wide-character command
+   line.
+  _argc: Points to the argument count.
+         It is lowered if the wide command line yields fewer arguments.
+  _argv: Points to the argument array.
+         Its entries are overwritten in place with malloc()ed strings, which
+          this function never frees.
+  If shell32.dll or CommandLineToArgvW cannot be found, or the command line
+   cannot be parsed, *_argc and *_argv are left untouched.*/
+void win32_utf8_setup(int *_argc,const char ***_argv){
+  HMODULE hlib;
+  /*We need to set stdin/stdout to binary mode.
+    This is unrelated to UTF-8 support, but it's platform specific and we need
+     to do it in the same places.*/
+  _setmode(_fileno(stdin),_O_BINARY);
+  _setmode(_fileno(stdout),_O_BINARY);
+  hlib=LoadLibraryA("shell32.dll");
+  if(hlib!=NULL){
+    command_line_to_argv_w_func command_line_to_argv_w;
+    /*This function is only available on Windows 2000 or later, hence the
+       run-time lookup.*/
+    command_line_to_argv_w=(command_line_to_argv_w_func)GetProcAddress(hlib,
+     "CommandLineToArgvW");
+    if(command_line_to_argv_w!=NULL){
+      wchar_t **argvw;
+      int       argc;
+      argvw=(*command_line_to_argv_w)(GetCommandLineW(),&argc);
+      if(argvw!=NULL){
+        int ai;
+        /*Really, I don't see why argc would ever differ from *_argc, but let's
+           be paranoid.
+          Clamping keeps the loop below from writing past the end of the
+           caller's array.
+          An argument whose conversion fails keeps its original value.*/
+        if(argc>*_argc)argc=*_argc;
+        for(ai=0;ai<argc;ai++){
+          char *argv;
+          argv=utf16_to_utf8(argvw[ai]);
+          if(argv!=NULL)(*_argv)[ai]=argv;
+        }
+        *_argc=argc;
+        LocalFree(argvw);
+      }
+    }
+    FreeLibrary(hlib);
+  }
+# if defined(CP_UTF8)
+  /*This does not work correctly in all environments (it breaks output in
+     mingw32 for me), and requires a Unicode font (e.g., when using the default
+     Raster font, even characters that are available in the font's codepage
+     won't display properly).
+    The call is therefore left disabled.*/
+  /*SetConsoleOutputCP(CP_UTF8);*/
+# endif
diff --git a/examples/win32utf8.h b/examples/win32utf8.h
new file mode 100644
index 0000000..7f99171
--- /dev/null
+++ b/examples/win32utf8.h
@@ -0,0 +1,9 @@
+#if !defined(_win32utf8_H)
+# define _win32utf8_H (1)
+# if defined(_WIN32)
+/*Make a best-effort attempt to support UTF-8 on Windows.*/
+void win32_utf8_setup(int *_argc,const char ***_argv);
+# endif
diff --git a/include/opusfile.h b/include/opusfile.h
index 3478e55..6e1e214 100644
--- a/include/opusfile.h
+++ b/include/opusfile.h
@@ -630,6 +630,10 @@ struct OpusFileCallbacks{
                      If there is an error opening the file, nothing will be
                       filled in here.
    \param      _path The path to the file to open.
+                     On Windows, this string must be UTF-8 (to allow access to
+                      files whose names cannot be represented in the current
+                      MBCS code page).
+                     All other systems use the native character encoding.
    \param      _mode The mode to open the file in.
    \return A stream handle to use with the callbacks, or <code>NULL</code> on
@@ -663,6 +667,10 @@ OP_WARN_UNUSED_RESULT void *op_fdopen(OpusFileCallbacks *_cb,
                        If there is an error opening the file, nothing will be
                         filled in here.
    \param      _path   The path to the file to open.
+                       On Windows, this string must be UTF-8 (to allow access
+                        to files whose names cannot be represented in the
+                        current MBCS code page).
+                       All other systems use the native character encoding.
    \param      _mode   The mode to open the file in.
    \param      _stream A stream previously returned by op_fopen(), op_fdopen(),
                         or op_freopen().
diff --git a/src/stream.c b/src/stream.c
index 1f1ba2d..1004b27 100644
--- a/src/stream.c
+++ b/src/stream.c
@@ -103,9 +103,124 @@ static const OpusFileCallbacks OP_FILE_CALLBACKS={
+#if defined(_WIN32)
+# include <stddef.h>
+# include <errno.h>
+/*Windows doesn't accept UTF-8 by default, and we don't have a wchar_t API,
+   so if we just pass the path to fopen(), then there'd be no way for a user
+   of our API to open a Unicode filename.
+  Instead, we translate from UTF-8 to UTF-16 and use Windows' wchar_t API.
+  This makes this API more consistent with platforms where the character set
+   used by fopen is the same as used on disk, which is generally UTF-8, and
+   with our metadata API, which always uses UTF-8.
+  _src: The NUL-terminated UTF-8 string to convert.
+  Return: A newly malloc()ed, NUL-terminated UTF-16 string, which the caller
+           must free(), or NULL if the allocation failed or _src was not
+           valid UTF-8 (a bad or missing continuation byte, an overlong
+           encoding, an encoded surrogate, or a value above 0x10FFFF).*/
+static wchar_t *op_utf8_to_utf16(const char *_src){
+  wchar_t *dst;
+  size_t   len;
+  len=strlen(_src);
+  /*Worst-case output is 1 wide character per 1 input character.*/
+  dst=(wchar_t *)malloc(sizeof(*dst)*(len+1));
+  if(dst!=NULL){
+    size_t si;
+    size_t di;
+    for(di=si=0;si<len;si++){
+      int c0;
+      c0=(unsigned char)_src[si];
+      if(!(c0&0x80)){
+        /*Start byte says this is a 1-byte sequence.*/
+        dst[di++]=(wchar_t)c0;
+        continue;
+      }
+      else if(si+1<len){
+        int c1;
+        c1=(unsigned char)_src[si+1];
+        if((c1&0xC0)==0x80){
+          /*Found at least one continuation byte.*/
+          if((c0&0xE0)==0xC0){
+            wchar_t w;
+            /*Start byte says this is a 2-byte sequence.
+              The payload bits of c0 must be masked before they are shifted:
+               << binds tighter than &.*/
+            w=(c0&0x1F)<<6|c1&0x3F;
+            if(w>=0x80U){
+              /*This is a 2-byte sequence that is not overlong.*/
+              dst[di++]=w;
+              si++;
+              continue;
+            }
+          }
+          else if(si+2<len){
+            int c2;
+            c2=(unsigned char)_src[si+2];
+            if((c2&0xC0)==0x80){
+              /*Found at least two continuation bytes.*/
+              if((c0&0xF0)==0xE0){
+                wchar_t w;
+                /*Start byte says this is a 3-byte sequence.*/
+                w=(c0&0xF)<<12|(c1&0x3F)<<6|c2&0x3F;
+                if(w>=0x800U&&(w<0xD800||w>=0xE000)){
+                  /*This is a 3-byte sequence that is not overlong and not a
+                     UTF-16 surrogate pair value.*/
+                  dst[di++]=w;
+                  si+=2;
+                  continue;
+                }
+              }
+              else if(si+3<len){
+                int c3;
+                c3=(unsigned char)_src[si+3];
+                if((c3&0xC0)==0x80){
+                  /*Found at least three continuation bytes.*/
+                  if((c0&0xF8)==0xF0){
+                    opus_uint32 w;
+                    /*Start byte says this is a 4-byte sequence.
+                      The low six bits come from c3 and must be OR'd in
+                       alongside the other three fields.*/
+                    w=(c0&7)<<18|(c1&0x3F)<<12|(c2&0x3F)<<6|c3&0x3F;
+                    if(w>=0x10000U&&w<0x110000U){
+                      /*This is a 4-byte sequence that is not overlong and not
+                         greater than the largest valid Unicode code point.
+                        Convert it to a surrogate pair.*/
+                      w-=0x10000;
+                      dst[di++]=(wchar_t)(0xD800+(w>>10));
+                      dst[di++]=(wchar_t)(0xDC00+(w&0x3FF));
+                      si+=3;
+                      continue;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      /*If we got here, we encountered an illegal UTF-8 sequence.*/
+      free(dst);
+      return NULL;
+    }
+    OP_ASSERT(di<=len);
+    dst[di]='\0';
+  }
+  return dst;
 void *op_fopen(OpusFileCallbacks *_cb,const char *_path,const char *_mode){
   FILE *fp;
+#if !defined(_WIN32)
   fp=fopen(_path,_mode);
+#else
+  /*_wfopen() takes UTF-16, so convert both strings from UTF-8 first.
+    A NULL argument or an unconvertible mode is reported as EINVAL, and an
+     unconvertible path as ENOENT.
+    op_utf8_to_utf16() also returns NULL when it runs out of memory, so those
+     codes cover allocation failure as well.*/
+  fp=NULL;
+  if(_path==NULL||_mode==NULL)errno=EINVAL;
+  else{
+    wchar_t *wpath;
+    wchar_t *wmode;
+    wpath=op_utf8_to_utf16(_path);
+    wmode=op_utf8_to_utf16(_mode);
+    if(wmode==NULL)errno=EINVAL;
+    else if(wpath==NULL)errno=ENOENT;
+    else fp=_wfopen(wpath,wmode);
+    /*free(NULL) is a no-op, so both can be released unconditionally.*/
+    free(wmode);
+    free(wpath);
+  }
+#endif
   return fp;
@@ -120,7 +235,23 @@ void *op_fdopen(OpusFileCallbacks *_cb,int _fd,const char *_mode){
 void *op_freopen(OpusFileCallbacks *_cb,const char *_path,const char *_mode,
  void *_stream){
   FILE *fp;
+#if !defined(_WIN32)
   fp=freopen(_path,_mode,(FILE *)_stream);
+#else
+  /*_wfreopen() takes UTF-16, so convert both strings from UTF-8 first.
+    A NULL argument or an unconvertible mode is reported as EINVAL, and an
+     unconvertible path as ENOENT.
+    op_utf8_to_utf16() also returns NULL when it runs out of memory, so those
+     codes cover allocation failure as well.*/
+  fp=NULL;
+  if(_path==NULL||_mode==NULL)errno=EINVAL;
+  else{
+    wchar_t *wpath;
+    wchar_t *wmode;
+    wpath=op_utf8_to_utf16(_path);
+    wmode=op_utf8_to_utf16(_mode);
+    if(wmode==NULL)errno=EINVAL;
+    else if(wpath==NULL)errno=ENOENT;
+    else fp=_wfreopen(wpath,wmode,(FILE *)_stream);
+    /*free(NULL) is a no-op, so both can be released unconditionally.*/
+    free(wmode);
+    free(wpath);
+  }
+#endif
   return fp;