Commit a55d565b authored by Jan Gerber's avatar Jan Gerber
Browse files

if available, use iconv for character conversion, patch from ogg.k

parent 38bb5674
......@@ -149,6 +149,12 @@ if env['libkate']:
You can also run ./get_libkate.sh (for more information see INSTALL)
or update PKG_CONFIG_PATH to point to libkate's source folder
"""
if conf.CheckCHeader('iconv.h'):
env.Append(CCFLAGS=[
'-DHAVE_ICONV'
])
env = conf.Finish()
# ffmpeg2theora
......
......@@ -216,7 +216,7 @@ class SimpleTheoraEncoder(wx.Frame):
self.removeItem.Enable()
def OnClickAdd(self, event):
result = addVideoDialog(self, theoraenc.hasKate)
result = addVideoDialog(self, theoraenc.hasKate, theoraenc.hasIconv)
time.sleep(0.5)
if result['ok']:
self.addItemToQueue(result['videoFile'], result)
......
......@@ -4,6 +4,7 @@
import os
from os.path import basename
import time
import subprocess
import wx
#import wx.lib.langlistctrl
......@@ -23,7 +24,7 @@ from wx.lib.mixins.listctrl import ListCtrlAutoWidthMixin
class SubtitlesProperties(wx.Dialog):
def __init__(
self, parent, ID, title,
language, category, encoding, file,
language, category, encoding, file, hasIconv,
size=wx.DefaultSize, pos=wx.DefaultPosition,
style=wx.DEFAULT_DIALOG_STYLE,
):
......@@ -31,6 +32,8 @@ class SubtitlesProperties(wx.Dialog):
pre.Create(parent, ID, title, pos, size, style)
self.PostCreate(pre)
self.hasIconv = hasIconv
# defaults
if language == '':
language = 'en'
......@@ -64,8 +67,10 @@ class SubtitlesProperties(wx.Dialog):
self.addProperty(mainBox, 'Category', self.categoryWidget, self.OnCategoryHelp)
# encoding
encodings = ['UTF-8', 'ISO-8859-1']
self.encodingWidget = wx.Choice(self, -1, (80,-1), choices=encodings, name=encoding)
if hasIconv:
self.encodingWidget = wx.ComboBox(self, -1, encoding, (80,-1), wx.DefaultSize, self.BuildEncodingsList(self.hasIconv), wx.CB_SIMPLE)
else:
self.encodingWidget = wx.Choice(self, -1, (80,-1), choices=self.BuildEncodingsList(self.hasIconv))
self.addProperty(mainBox, 'Encoding', self.encodingWidget, self.OnEncodingHelp)
#Buttons
......@@ -134,11 +139,15 @@ class SubtitlesProperties(wx.Dialog):
'If the language tag needed is not available in the list, a custom one may be entered.\n')
def OnEncodingHelp(self, event):
    """Show the help text for the subtitles encoding field.

    The sentence about iconv is included only when self.hasIconv is true.
    The event argument is not used.
    """
    iconv_blurb = ''
    if self.hasIconv:
        iconv_blurb = 'ffmpeg2theora was built with iconv support, so can also convert any encoding that is supported by iconv.\n'
    self.DisplayHelp(
        # the parenthesis opened after "UTF-8" is closed after "script"
        'Kate streams are encoded in UTF-8 (a Unicode character encoding that allows to represent '+
        'pretty much any existing script).\n'+
        'If the input file is not already encoded in UTF-8, it will need converting to UTF-8 first.\n'+
        'ffmpeg2theora can convert ISO-8859-1 (also known as latin1) encoding directly.\n'+
        iconv_blurb+
        'Files in other encodings will have to be converted manually in order to be used. See the '+
        'subtitles.txt documentation for more information on how to manually convert files.\n')
......@@ -175,16 +184,13 @@ class SubtitlesProperties(wx.Dialog):
# add in whatever's known from 'locale -a' - this works fine if locale isn't found,
# but i'm not sure what that'll do if we get another program named locale that spews
# random stuff to stdout :)
f = os.popen('locale -a')
line = f.readline()
while line:
p = subprocess.Popen(['locale', '-a'], shell=False, stdout=subprocess.PIPE, close_fds=True)
data, err = p.communicate()
for line in data.strip().split('\n'):
line = self.ExtractLanguage(line)
if line != '' and line != 'C' and line != 'POSIX':
if line != '' and line != 'C' and line != 'POSIX' and line not in languages:
languages.append(line)
line = f.readline()
f.close()
#oneliner from german python forum => unique list
languages = [languages[i] for i in xrange(len(languages)) if languages[i] not in languages[:i]]
languages.sort()
return languages
......@@ -197,8 +203,23 @@ class SubtitlesProperties(wx.Dialog):
line = line.split('\r')[0] # Mac or Windows
return line
def addSubtitlesPropertiesDialog(parent, language, category, encoding, file):
dlg = SubtitlesProperties(parent, -1, "Add subtitles", language, category, encoding, file, size=(490, 560), style=wx.DEFAULT_DIALOG_STYLE)
def BuildEncodingsList(self, hasIconv):
    """Return the list of subtitle encodings to offer.

    The list always starts with 'UTF-8' and 'ISO-8859-1', which
    ffmpeg2theora can handle without iconv.  When hasIconv is true, every
    name printed by `iconv -l` (cut at the first '/') is appended, skipping
    empty names and duplicates.

    Listing is best effort: if the iconv program cannot be started, only
    the basic set is returned instead of raising.
    """
    # start with a known basic set, that ffmpeg2theora can handle without iconv
    encodings = ['UTF-8', 'ISO-8859-1']
    if not hasIconv:
        return encodings
    # this creates a *huge* spammy list with my version of iconv...
    # add in whatever iconv knows about
    try:
        p = subprocess.Popen(['iconv', '-l'], shell=False, stdout=subprocess.PIPE, close_fds=True)
        data, err = p.communicate()
    except (OSError, ValueError):
        # OSError: no iconv program to run.  ValueError: Python 2 on Windows
        # rejects close_fds combined with a redirected stdout.
        return encodings
    if not isinstance(data, str):
        # bytes under Python 3; treat the names as ASCII text
        data = data.decode('ascii', 'replace')
    seen = set(encodings)
    for line in data.split('\n'):
        name = line.split('/')[0].strip()  # stop at a /
        if name and name not in seen:
            seen.add(name)
            encodings.append(name)
    return encodings
def addSubtitlesPropertiesDialog(parent, language, category, encoding, file, hasIconv):
dlg = SubtitlesProperties(parent, -1, "Add subtitles", language, category, encoding, file, hasIconv, size=(490, 560), style=wx.DEFAULT_DIALOG_STYLE)
dlg.CenterOnScreen()
val = dlg.ShowModal()
result = dict()
......@@ -211,7 +232,10 @@ def addSubtitlesPropertiesDialog(parent, language, category, encoding, file):
# result['subtitlesLanguage'] = dlg.languageWidget.GetValue()
result['subtitlesLanguage'] = dlg.languageWidget.GetValue()
result['subtitlesCategory'] = dlg.categoryWidget.GetValue()
result['subtitlesEncoding'] = dlg.encodingWidget.GetStringSelection()
if hasIconv:
result['subtitlesEncoding'] = dlg.encodingWidget.GetValue()
else:
result['subtitlesEncoding'] = dlg.encodingWidget.GetStringSelection()
print result
else:
result['ok'] = False
......
......@@ -12,11 +12,14 @@ import theoraenc
class AddVideoDialog(wx.Dialog):
def __init__(
self, parent, ID, title, hasKate, size=wx.DefaultSize, pos=wx.DefaultPosition,
self, parent, ID, title, hasKate, hasIconv,
size=wx.DefaultSize, pos=wx.DefaultPosition,
style=wx.DEFAULT_DIALOG_STYLE,
):
self.videoFile = ''
self.hasKate = hasKate
self.hasIconv = hasIconv
pre = wx.PreDialog()
#pre.SetExtraStyle(wx.DIALOG_EX_CONTEXTHELP)
......@@ -360,7 +363,7 @@ class AddVideoDialog(wx.Dialog):
category = self.subtitles.GetItem(idx, 1).GetText()
encoding = self.subtitles.GetItem(idx, 2).GetText()
file = self.subtitles.GetItem(idx, 3).GetText()
result = addSubtitlesPropertiesDialog(self, language, category, encoding, file)
result = addSubtitlesPropertiesDialog(self, language, category, encoding, file, self.hasIconv)
time.sleep(0.5) # why ? race condition ?
if result['ok']:
self.subtitles.SetStringItem(idx, 0, result['subtitlesLanguage'])
......@@ -372,8 +375,8 @@ class AddVideoDialog(wx.Dialog):
return False
def addVideoDialog(parent, hasKate):
dlg = AddVideoDialog(parent, -1, "Add Video", hasKate, size=(490, 560), style=wx.DEFAULT_DIALOG_STYLE)
def addVideoDialog(parent, hasKate, hasIconv):
dlg = AddVideoDialog(parent, -1, "Add Video", hasKate, hasIconv, size=(490, 560), style=wx.DEFAULT_DIALOG_STYLE)
dlg.CenterOnScreen()
val = dlg.ShowModal()
result = dict()
......
......@@ -31,16 +31,20 @@ def probe_ffmpeg2theora():
def probe_kate(ffmpeg2theora):
hasKate = False
cmd = ffmpeg2theora + ' --help'
f = os.popen(cmd)
line = f.readline()
while line:
if line.find('Subtitles options:') >= 0:
hasKate = True
line = f.readline()
f.close()
p = subprocess.Popen([ffmpeg2theora, '--help'], shell=False, stdout=subprocess.PIPE, close_fds=True)
data, err = p.communicate()
if 'Subtitles options:' in data:
hasKate = True
return hasKate
def probe_iconv(ffmpeg2theora):
    """Return True if the given ffmpeg2theora binary reports iconv support.

    Runs `<ffmpeg2theora> --help` and looks for the usage line that
    describes iconv based encoding support.  Returns False when the
    binary cannot be started, rather than raising.
    """
    try:
        p = subprocess.Popen([ffmpeg2theora, '--help'], shell=False, stdout=subprocess.PIPE, close_fds=True)
        data, err = p.communicate()
    except (OSError, ValueError):
        # OSError: the binary could not be executed.  ValueError: Python 2 on
        # Windows rejects close_fds combined with a redirected stdout.
        return False
    if not isinstance(data, str):
        # bytes under Python 3; the marker we look for is plain ASCII
        data = data.decode('ascii', 'replace')
    return 'supported are all encodings supported by iconv' in data
def timestr(seconds):
hours = int(seconds/3600)
minutes = int((seconds-( hours*3600 ))/60)
......@@ -175,4 +179,5 @@ def fileInfo(filename):
ffmpeg2theora = probe_ffmpeg2theora()
hasKate = probe_kate(ffmpeg2theora)
hasIconv = probe_iconv(ffmpeg2theora)
......@@ -1637,7 +1637,11 @@ void print_usage() {
"Subtitles options:\n"
" --subtitles file use subtitles from the given file (SubRip (.srt) format)\n"
" --subtitles-encoding encoding set encoding of the subtitles file\n"
#ifdef HAVE_ICONV
" supported are all encodings supported by iconv (see iconv help for list)\n"
#else
" supported are " SUPPORTED_ENCODINGS "\n"
#endif
" --subtitles-language language set subtitles language (de, en_GB, etc)\n"
" --subtitles-category category set subtitles category (default \"subtitles\")\n"
" --subtitles-ignore-non-utf8 ignores any non UTF-8 sequence in UTF-8 text\n"
......@@ -1900,11 +1904,12 @@ int main(int argc, char **argv) {
info.with_kate=1;
break;
case SUBTITLES_ENCODING_FLAG:
if (!strcasecmp(optarg,"utf-8")) set_subtitles_encoding(convert,ENC_UTF8);
else if (!strcasecmp(optarg,"utf8")) set_subtitles_encoding(convert,ENC_UTF8);
else if (!strcasecmp(optarg,"iso-8859-1")) set_subtitles_encoding(convert,ENC_ISO_8859_1);
else if (!strcasecmp(optarg,"latin1")) set_subtitles_encoding(convert,ENC_ISO_8859_1);
else report_unknown_subtitle_encoding(optarg, info.frontend);
if (is_valid_encoding(optarg)) {
set_subtitles_encoding(convert,optarg);
}
else {
report_unknown_subtitle_encoding(optarg, info.frontend);
}
flag = -1;
break;
case SUBTITLES_IGNORE_NON_UTF8_FLAG:
......
......@@ -3,13 +3,6 @@
#include "subtitles.h"
typedef enum {
ENC_UNSET,
ENC_UTF8,
ENC_ISO_8859_1,
} F2T_ENCODING;
typedef struct ff2theora_subtitle{
char *text;
size_t len;
......@@ -28,7 +21,7 @@ typedef struct ff2theora_kate_stream{
/* this block valid for all subtitle sources */
size_t subtitles_count; /* total subtitles output so far */
F2T_ENCODING subtitles_encoding;
char *subtitles_encoding;
char subtitles_language[16];
char subtitles_category[16];
} ff2theora_kate_stream;
......
......@@ -27,6 +27,9 @@
#include <math.h>
#include <errno.h>
#include <stdarg.h>
#ifdef HAVE_ICONV
#include "iconv.h"
#endif
#include "libavformat/avformat.h"
......@@ -58,6 +61,27 @@ static void warn(FILE *frontend, const char *file, unsigned int line, const char
va_end(ap);
}
/**
 * Tells whether subtitles in the given encoding can be converted to UTF-8.
 * With iconv, that is any encoding iconv can open a conversion to UTF-8 for.
 * Without iconv, only the UTF-8 and ISO-8859-1 names listed below are
 * accepted (compared case insensitively).
 * Returns 1 if the encoding is supported, 0 otherwise.
 */
int is_valid_encoding(const char *encoding)
{
#ifdef HAVE_ICONV
  /* the encoding is usable if iconv agrees to build a converter for it */
  iconv_t probe = iconv_open("UTF-8", encoding);
  if (probe == (iconv_t)-1) return 0;
  iconv_close(probe);
  return 1;
#else
  static const char * const known[] = { "UTF-8", "UTF8", "iso-8859-1", "latin1" };
  size_t i;

  for (i = 0; i < sizeof(known)/sizeof(known[0]); ++i) {
    if (strcasecmp(encoding, known[i]) == 0) return 1;
  }
  return 0;
#endif
}
/**
* adds a new kate stream structure
*/
......@@ -70,7 +94,7 @@ void add_kate_stream(ff2theora this){
ks->subtitles = 0;
ks->stream_index = -1;
ks->subtitles_count = 0; /* denotes not set yet */
ks->subtitles_encoding = ENC_UNSET;
ks->subtitles_encoding = NULL;
strcpy(ks->subtitles_language, "");
strcpy(ks->subtitles_category, "");
}
......@@ -136,13 +160,13 @@ void set_subtitles_category(ff2theora this,const char *category){
/**
* sets the encoding of the next subtitles file
*/
void set_subtitles_encoding(ff2theora this,F2T_ENCODING encoding){
void set_subtitles_encoding(ff2theora this,const char *encoding){
size_t n;
for (n=0; n<this->n_kate_streams;++n) {
if (this->kate_streams[n].stream_index==-1 && this->kate_streams[n].subtitles_encoding==ENC_UNSET) break;
if (this->kate_streams[n].stream_index==-1 && !this->kate_streams[n].subtitles_encoding) break;
}
if (n==this->n_kate_streams) add_kate_stream(this);
this->kate_streams[n].subtitles_encoding = encoding;
this->kate_streams[n].subtitles_encoding = strdup(encoding);
}
......@@ -175,23 +199,27 @@ static double hmsms2s(int h,int m,int s,int ms)
}
/* very simple implementation when no iconv */
static char *convert_subtitle_to_utf8(F2T_ENCODING encoding,char *text,int ignore_non_utf8, FILE *frontend)
static char *convert_subtitle_to_utf8(const char *encoding,char *text,int ignore_non_utf8, FILE *frontend)
{
size_t nbytes;
char *ptr;
char *newtext = NULL;
int errors=0;
#ifdef HAVE_ICONV
iconv_t cd;
#endif
if (!text) return NULL;
switch (encoding) {
case ENC_UNSET:
/* we don't know what encoding this is, assume UTF-8 and we'll yell if it ain't */
/* fall through */
case ENC_UTF8:
if (encoding == NULL) {
/* we don't know what encoding this is, assume UTF-8 and we'll yell if it ain't */
encoding = "UTF-8";
}
if (!strcasecmp(encoding, "UTF-8") || !strcasecmp(encoding, "UTF8")) {
/* nothing to do, already in UTF-8 */
if (ignore_non_utf8) {
/* actually, give the user the option of just ignoring non UTF8 characters */
/* actually, give the user the option of just ignoring non UTF-8 characters */
char *wptr;
size_t wlen0;
......@@ -231,8 +259,40 @@ static char *convert_subtitle_to_utf8(F2T_ENCODING encoding,char *text,int ignor
else {
newtext = strdup(text);
}
break;
case ENC_ISO_8859_1:
return newtext;
}
/* now, we can either use iconv, or convert ISO-8859-1 by hand (so to speak) */
#ifdef HAVE_ICONV
  /* create a conversion for each string, it avoids having to pass around this descriptor,
     and the speed hit will be irrelevant anyway compared to video decoding/encoding.
     that's fine, because we don't need to keep state across subtitles. */
  cd = iconv_open("UTF-8", encoding);
  if (cd != (iconv_t)-1) {
    /* iconv doesn't seem to have a mode to do a dummy convert to just return the number
       of bytes needed, so we just allocate 6 times the number of bytes in the string,
       which should be the max we need for UTF-8 */
    /* the terminating NUL is counted as input, so it gets converted along with the text */
    size_t insz=strlen(text)+1;
    size_t outsz = insz*6;
    char *inptr = text, *outptr;
    newtext = (char*)malloc(outsz);
    if (!newtext) {
      warn(frontend, NULL, 0, "Memory allocation failed - cannot convert text\n");
      iconv_close(cd);
      return NULL;
    }
    outptr=newtext;
    /* iconv() reports failure by returning (size_t)-1; the result is unsigned,
       so comparing it with "< 0" could never detect an error */
    if (iconv(cd, &inptr, &insz, &outptr, &outsz) == (size_t)-1) {
      warn(frontend, NULL, 0, "Failed to convert text to UTF-8\n");
      free(newtext);
      newtext = NULL;
    }
    iconv_close(cd);
  }
#else
if (!strcasecmp(encoding, "iso-8859-1") || !strcasecmp(encoding, "latin1")) {
/* simple, characters above 0x7f are broken in two,
and code points map to the iso-8859-1 8 bit codes */
nbytes=0;
......@@ -256,11 +316,11 @@ static char *convert_subtitle_to_utf8(F2T_ENCODING encoding,char *text,int ignor
}
}
newtext[nbytes++]=0;
break;
default:
}
#endif
else {
/* encoding is a string (no longer an enum value), so it must be printed with %s */
warn(frontend, NULL, 0, "encoding %s not handled in conversion!", encoding);
newtext = strdup("");
break;
}
return newtext;
}
......@@ -357,7 +417,7 @@ int load_subtitles(ff2theora_kate_stream *this, int ignore_non_utf8, FILE *front
/* we have all the lines for that subtitle, remove the last \n */
remove_last_newline(text);
/* we want all text to be UTF8 */
/* we want all text to be UTF-8 */
utf8=convert_subtitle_to_utf8(this->subtitles_encoding,text,ignore_non_utf8, frontend);
if (!utf8) {
warn(frontend, this->filename, line, "Failed to get UTF-8 text");
......@@ -479,6 +539,7 @@ void free_subtitles(ff2theora this)
ff2theora_kate_stream *ks=this->kate_streams+i;
for (n=0; n<ks->num_subtitles; ++n) free(ks->subtitles[n].text);
free(ks->subtitles);
free(ks->subtitles_encoding);
}
free(this->kate_streams);
}
......
......@@ -15,6 +15,7 @@
#define SUPPORTED_ENCODINGS "utf-8, utf8, iso-8859-1, latin1"
extern int is_valid_encoding(const char *encoding);
extern void add_kate_stream(ff2theora this);
extern int load_subtitles(ff2theora_kate_stream *this, int ignore_non_utf8, FILE *frontend);
extern void free_subtitles(ff2theora this);
......@@ -24,7 +25,7 @@ extern int add_subtitle_for_stream(ff2theora_kate_stream *streams, int nstreams,
extern void set_subtitles_file(ff2theora this,const char *filename);
extern void set_subtitles_language(ff2theora this,const char *language);
extern void set_subtitles_category(ff2theora this,const char *category);
extern void set_subtitles_encoding(ff2theora this,F2T_ENCODING encoding);
extern void set_subtitles_encoding(ff2theora this,const char *encoding);
extern void report_unknown_subtitle_encoding(const char *name, FILE *frontend);
#endif
......
......@@ -14,6 +14,8 @@ Subtitles are read from SubRip (.srt) format files and converted to
Kate streams. Those SubRip files must be encoded in UTF-8 (7 bit ASCII
is a subset of UTF-8 so is valid input as well). See below for more
information on converting SubRip files with other encodings to UTF-8.
ffmpeg2theora can convert files to UTF-8 transparently if built with
a C library that supports iconv.
Subtitles support requires libkate, available from:
http://code.google.com/p/libkate
......@@ -79,6 +81,8 @@ Use --nosubtitles if those are not to be converted.
* Converting non-UTF-8 files to UTF-8
If ffmpeg2theora wasn't built with iconv support, only UTF-8 and latin1
input text is supported.
If you have SubRip files in an encoding other than UTF-8, you can use the
iconv or recode programs to convert them to UTF-8 so ffmpeg2theora can
read them.
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment