00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #include "kcharsets.h"
00020
00021 #include "kentities.c"
00022
00023 #include <kapplication.h>
00024 #include <kglobal.h>
00025 #include <klocale.h>
00026 #include <kconfig.h>
00027
00028 #include <qfontinfo.h>
00029 #include <qstrlist.h>
00030 #include <qfontdatabase.h>
00031 #include <kdebug.h>
00032
00033 #include <qtextcodec.h>
00034 #include <qmap.h>
00035 #include <qcstring.h>
00036
00037 #include <assert.h>
00038
00039 #define CHARSETS_COUNT 33
00040
00041 static const char * const language_names[] = {
00042 I18N_NOOP( "Other" ),
00043 I18N_NOOP( "Arabic" ),
00044 I18N_NOOP( "Baltic" ),
00045 I18N_NOOP( "Central European" ),
00046 I18N_NOOP( "Chinese Simplified" ),
00047 I18N_NOOP( "Chinese Traditional" ),
00048 I18N_NOOP( "Cyrillic" ),
00049 I18N_NOOP( "Greek" ),
00050 I18N_NOOP( "Hebrew" ),
00051 I18N_NOOP( "Japanese" ),
00052 I18N_NOOP( "Korean" ),
00053 I18N_NOOP( "Thai" ),
00054 I18N_NOOP( "Turkish" ),
00055 I18N_NOOP( "Western European" ),
00056 I18N_NOOP( "Tamil" ),
00057 I18N_NOOP( "Unicode" ),
00058 I18N_NOOP( "Northern Saami" )
00059 };
00060
00061
00062
00063
00064
00065
00066
00067
00068
// Flat list of 0-terminated groups, itself terminated by one extra 0:
//
//     <encoding name>, <charset>, <charset>, ..., 0,
//
// The first string of a group is the encoding name reported by
// availableEncodingNames(); the remaining strings are the charsets associated
// with that encoding (presumably font charset names -- the code visible in
// this file only checks that at least one is listed).
//
// Fixed: the gb18030 group used to list "gb2313.1980-0", a typo for
// "gb2312.1980-0" (the spelling used by the gbk and gb2312 groups).
//
// NOTE(review): a few charset spellings are inconsistent with the rest of the
// table ("iso-8859-5"/"iso-8859-2" vs. "iso8859-N", "cp1252" vs. "cp 1252").
// They are left untouched here -- confirm the intended spelling.
static const char* const charsets_for_encoding[] = {
    "koi8-r",          "koi8-r", "cp 1251", "koi8-u", "iso-8859-5", 0,
    "koi8-u",          "koi8-u", "cp 1251", "iso-8859-5", "koi8-r", 0,
    "iso 8859-1",      "iso8859-1", "iso8859-15", 0,
    "iso 8859-2",      "iso8859-2", "unicode", "iso8859-1", 0,
    "iso 8859-3",      "iso8859-3", "unicode", "iso8859-1", 0,
    "iso 8859-4",      "iso8859-4", "unicode", "iso8859-13", "iso8859-1", 0,
    "iso 8859-5",      "iso8859-5", "koi8-u", "koi8-r", 0,
    "iso 8859-6",      "unicode", "iso8859-6", 0,
    "iso 8859-7",      "iso8859-7", 0,
    "iso 8859-8",      "iso8859-8", 0,
    "iso 8859-8-i",    "iso8859-8", 0,
    "iso 8859-9",      "iso8859-9", "unicode", "iso8859-1", 0,
    "iso 8859-11",     "iso8859-11", 0,
    "iso 8859-13",     "iso8859-13", "unicode", "iso8859-4", "iso8859-1", 0,
    "iso 8859-15",     "iso8859-15", "unicode", "iso8859-1", 0,
    "utf8",            "unicode", "iso8859-1", 0,
    "utf16",           "unicode", "iso8859-1", 0,
    "iso-10646-ucs-2", "unicode", "iso8859-1", 0,
    "cp 1250",         "iso8859-2", 0,
    "cp 1251",         "cp 1251", "koi8-u", "koi8-r", "iso8859-5", 0,
    "cp 1252",         "iso8859-1", 0,
    "cp 1253",         "iso8859-7", 0,
    "cp 1254",         "iso8859-9", 0,
    "cp 1255",         "iso8859-8", 0,
    "cp 1256",         "unicode", "iso8859-6", 0,
    "cp 1257",         "iso8859-13", "iso8859-4", 0,
    "ibm850",          "ibm850", "unicode", "iso8859-1", 0,
    "ibm852",          "unicode", "iso-8859-2", 0,
    "ibm866",          "ibm866", "cp 1251", "koi8-u", "koi8-r", "iso8859-5", 0,
    "tis620",          "iso8859-11", 0,
    "eucjp",           "eucjp", "unicode", "iso8859-1", 0,
    "sjis",            "eucjp", "unicode", "iso8859-1", 0,
    "jis7",            "eucjp", "unicode", "iso8859-1", 0,
    "big5",            "big5", "unicode", "iso8859-1", 0,
    "gbk",             "gb2312.1980-0", "gbk-0", "unicode", "iso8859-1", 0,
    "gb18030",         "gb18030.2000-1", "gb18030.2000-0", "unicode", "gbk-0", "gb2312.1980-0", "iso8859-1", 0,
    "gb2312",          "gb2312.1980-0", "unicode", "iso8859-1", 0,
    "euckr",           "euckr", "unicode", "iso8859-1", 0,
    "tscii",           "tscii", 0,
    "pt 154",          "pt 154", "cp 1251", "koi8-u", "koi8-r", "iso8859-5", 0,
    "winsami2",        "winsami2", "cp1252", "unicode", 0,
    0 };
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
// Maps an encoding name (as listed first in each charsets_for_encoding[]
// group) to a position in language_names[]. The table ends with a { 0, 0 }
// sentinel; encodings without an entry resolve to 0 ("Other") because
// kcharsets_array_search() returns 0 when nothing matches.
//
// Fixed: "tscii" had no entry, so it was labelled "Other" and the "Tamil"
// language name (14) was never used.
static struct LanguageForEncoding
{
    const char* index;  // encoding name, the lookup key
    int data;           // position in language_names[]
} const language_for_encoding[] = {
    // 13: Western European
    { "iso 8859-1", 13 },
    { "iso 8859-15", 13 },
    { "cp 1252", 13 },
    { "ibm850", 13 },
    // 3: Central European, 2: Baltic, 12: Turkish
    { "iso 8859-2", 3 },
    { "iso 8859-3", 3 },
    { "iso 8859-4", 2 },
    { "iso 8859-13", 2 },
    { "cp 1250", 3 },
    { "cp 1254", 12 },
    { "cp 1257", 2 },
    { "ibm852", 3 },
    // 6: Cyrillic
    { "koi8-r", 6 },
    { "iso 8859-5", 6 },
    { "cp 1251", 6 },
    { "koi8-u", 6 },
    { "pt 154", 6 },
    { "ibm866", 6 },
    // 5: Chinese Traditional, 4: Chinese Simplified
    { "big5", 5 },
    { "gb18030", 4 },
    { "gbk", 4 },
    { "gb2312", 4 },
    // 10: Korean, 9: Japanese
    { "euckr", 10 },
    { "sjis", 9 },
    { "jis7", 9 },
    { "eucjp", 9 },
    // 7: Greek
    { "iso 8859-7", 7 },
    { "cp 1253", 7 },
    // 1: Arabic
    { "iso 8859-6", 1 },
    { "cp 1256", 1 },
    // 8: Hebrew
    { "iso 8859-8", 8 },
    { "iso 8859-8-i", 8 },
    { "cp 1255", 8 },
    // 12: Turkish
    { "iso 8859-9", 12 },
    // 11: Thai
    { "tis620", 11 },
    { "iso 8859-11", 11 },
    // 14: Tamil
    { "tscii", 14 },
    // 15: Unicode
    { "utf8", 15 },
    { "utf16", 15 },
    { "utf7", 15 },
    { "ucs2", 15 },
    { "iso-10646-ucs-2", 15 },
    // 16: Northern Saami
    { "winsami2", 16 },
    { 0, 0 } };
00179
00180
// Alternative spellings of encoding names. codecForName() consults this table
// when QTextCodec::codecForName() does not recognise the (lower-cased) name as
// given, and then retries QTextCodec::codecForName() with the `data` string.
// Lookup is a linear, exact string match; the { 0, 0 } entry ends the table.
struct Builtin
{
    const char* index;  // spelling as it may be requested (lower case)
    const char* data;   // name handed to QTextCodec::codecForName()
};

static const struct Builtin builtin[] = {
    // KOI8
    { "iso-ir-111",        "koi8-r" },
    { "koi8-ru",           "koi8-u" },
    { "koi8r",             "koi8-r" },
    { "koi8u",             "koi8-u" },
    { "koi unified",       "koi8-r" },

    // ASCII
    { "us-ascii",          "iso 8859-1" },
    { "usascii",           "iso 8859-1" },

    // Unicode
    { "x-utf-8",           "utf-8" },
    { "x-utf-7",           "utf-7" },
    { "unicode-1-1-utf-7", "utf-7" },
    { "utf-16",            "iso-10646-ucs-2" },
    { "utf16",             "iso-10646-ucs-2" },
    { "ucs2",              "iso-10646-ucs-2" },
    { "iso10646-1",        "iso-10646-ucs-2" },

    // Chinese
    { "gb18030.2000-1",    "gb18030" },
    { "gb18030.2000-0",    "gb18030" },
    { "gbk-0",             "gbk" },
    { "gb2312",            "gbk" },
    { "gb2312.1980-0",     "gbk" },
    { "big5-0",            "big5" },

    // Korean
    { "euc-kr",            "euckr" },
    { "x-euc-kr",          "euckr" },

    // Japanese
    { "euc-jp",            "eucjp" },
    { "x-euc-jp",          "eucjp" },
    { "jisx0201.1976-0",   "eucjp" },
    { "jisx0208.1983-0",   "eucjp" },
    { "jisx0208.1990-0",   "eucjp" },
    { "jisx0208.1997-0",   "eucjp" },
    { "jisx0212.1990-0",   "eucjp" },
    { "jisx0213.2000-1",   "eucjp" },
    { "jisx0213.2000-2",   "eucjp" },
    { "shift_jis",         "sjis" },
    { "shift-jis",         "sjis" },
    { "x-sjis",            "sjis" },
    { "iso-2022-jp",       "jis7" },

    // "windowsNNN"
    { "windows850",        "ibm850" },
    { "windows866",        "ibm866" },
    { "windows1251",       "cp 1251" },
    { "windows1252",       "cp 1252" },
    { "windows1253",       "cp 1253" },
    { "windows1254",       "cp 1254" },
    { "windows1255",       "cp 1255" },
    { "windows1256",       "cp 1256" },
    { "windows1257",       "cp 1257" },

    // "windows-NNN"
    { "windows-850",       "ibm850" },
    { "windows-866",       "ibm866" },
    { "windows-1250",      "cp 1250" },
    { "windows-1251",      "cp 1251" },
    { "windows-1252",      "cp 1252" },
    { "windows-1253",      "cp 1253" },
    { "windows-1254",      "cp 1254" },
    { "windows-1255",      "cp 1255" },
    { "windows-1256",      "cp 1256" },
    { "windows-1257",      "cp 1257" },

    // "x-windows-NNN"
    { "x-windows-850",     "ibm850" },
    { "x-windows-866",     "ibm866" },
    { "x-windows-1250",    "cp 1250" },
    { "x-windows-1251",    "cp 1251" },
    { "x-windows-1252",    "cp 1252" },
    { "x-windows-1253",    "cp 1253" },
    { "x-windows-1254",    "cp 1254" },
    { "x-windows-1255",    "cp 1255" },
    { "x-windows-1256",    "cp 1256" },
    { "x-windows-1257",    "cp 1257" },

    // "cpNNN" / "cp-NNN"
    { "cp850",             "ibm850" },
    { "cp866",             "ibm866" },
    { "cp-850",            "ibm850" },
    { "cp-866",            "ibm866" },
    { "cp-1250",           "cp 1250" },
    { "cp-1251",           "cp 1251" },
    { "cp-1252",           "cp 1252" },
    { "cp-1253",           "cp 1253" },
    { "cp-1254",           "cp 1254" },
    { "cp-1255",           "cp 1255" },
    { "cp-1256",           "cp 1256" },
    { "cp-1257",           "cp 1257" },
    { "cp-10000",          "apple roman" },

    // "x-cp-NNN"
    { "x-cp-850",          "ibm850" },
    { "x-cp-866",          "ibm866" },
    { "x-cp-1250",         "cp 1250" },
    { "x-cp-1251",         "cp 1251" },
    { "x-cp-1252",         "cp 1252" },
    { "x-cp-1253",         "cp 1253" },
    { "x-cp-1254",         "cp 1254" },
    { "x-cp-1255",         "cp 1255" },
    { "x-cp-1256",         "cp 1256" },
    { "x-cp-1257",         "cp 1257" },
    { "x-cp-10000",        "apple roman" },

    // Thai
    { "tis620",            "iso 8859-11" },
    { "tis-620",           "iso 8859-11" },
    { "thai-tis620",       "iso 8859-11" },
    { "windows-874",       "iso 8859-11" },
    { "windows874",        "iso 8859-11" },
    { "x-windows-874",     "iso 8859-11" },
    { "cp874",             "iso 8859-11" },
    { "cp-874",            "iso 8859-11" },
    { "x-cp-874",          "iso 8859-11" },

    // Korean (KS C 5601)
    { "ksc5601.1987-0",    "euckr" },
    { "ks_c_5601-1987",    "euckr" },

    // ISO 8859 with dashes
    { "iso-8859-1",        "iso 8859-1" },
    { "iso-8859-2",        "iso 8859-2" },
    { "iso-8859-3",        "iso 8859-3" },
    { "iso-8859-4",        "iso 8859-4" },
    { "iso-8859-5",        "iso 8859-5" },
    { "iso-8859-6",        "iso 8859-6" },
    { "iso-8859-7",        "iso 8859-7" },
    { "iso-8859-8",        "iso 8859-8" },
    { "iso-8859-9",        "iso 8859-9" },
    { "iso-8859-10",       "iso 8859-10" },
    { "iso-8859-11",       "iso 8859-11" },
    { "iso-8859-12",       "iso 8859-12" },
    { "iso-8859-13",       "iso 8859-13" },
    { "iso-8859-14",       "iso 8859-14" },
    { "iso-8859-15",       "iso 8859-15" },

    // Miscellaneous
    { "tscii",             "tscii" },
    { "paratype-154",      "pt 154" },
    { "pt-154",            "pt 154" },
    { "x-winsami2",        "winsami2" },
    { "x-mac-roman",       "apple roman" },
    { "macintosh",         "apple roman" },
    { "mac",               "apple roman" },

    { 0, 0 }};
00308
00309
00310
// Alternative names used when looking for a charmap file. codecForName()
// consults this table after both QTextCodec lookups have failed; the `data`
// string (or the requested name itself when nothing matches) is upper-cased
// and appended to the charmap directory. The { 0, 0 } entry ends the table.
struct Aliases
{
    const char* index;  // requested name (lower case)
    const char* data;   // name used to build the charmap file path
};

static const struct Aliases aliases[] = {
    { "cp852",         "ibm852" },
    { "cp-852",        "ibm852" },
    { "x-cp-852",      "ibm852" },
    { "windows852",    "ibm852" },
    { "windows-852",   "ibm852" },
    { "x-windows-852", "ibm852" },
    { 0, 0 }};
00323
00324
00325
00326
// Last-resort substitutions. When codecForName() has found neither a codec nor
// a charmap file, it looks the lower-cased charmap name up here and asks
// QTextCodec::codecForName() for the `data` encoding instead.
// The { 0, 0 } entry ends the table.
struct ConversionHints
{
    const char* index;  // encoding that could not be resolved
    const char* data;   // encoding to try in its place
};

static const struct ConversionHints conversion_hints[] = {
    { "cp1250", "iso-8859-2" },
    { "koi8-r", "iso-8859-5" },
    { "koi8-u", "koi8-r" },
    { 0, 0 }};
00336
00337
00338
00339
/**
 * Linear search through one of the sentinel-terminated lookup tables above.
 *
 * @param start first element of a table whose elements have an `index`
 *              (const char* key) and a `data` member; the table must end
 *              with an element whose `index` is 0
 * @param entry key to look for, compared with qstrcmp()
 * @return the `data` member of the first element whose `index` equals
 *         @p entry, or 0 converted to Data when no element matches
 */
template< typename T, typename Data >
static Data kcharsets_array_search( const T* start, const char* entry )
{
    const T* candidate = start;
    while( candidate->index != 0 ) {
        if( qstrcmp( candidate->index, entry ) == 0 )
            return candidate->data;
        ++candidate;
    }
    return 0;
}
00350
00351
00352 class KCharsetsPrivate
00353 {
00354 public:
00355 KCharsetsPrivate(KCharsets* _kc)
00356 : codecForNameDict(43, false)
00357 {
00358 db = 0;
00359 kc = _kc;
00360 }
00361 ~KCharsetsPrivate()
00362 {
00363 delete db;
00364 }
00365 QFontDatabase *db;
00366 QAsciiDict<QTextCodec> codecForNameDict;
00367 KCharsets* kc;
00368 };
00369
00370
00371
00372 KCharsets::KCharsets()
00373 {
00374 d = new KCharsetsPrivate(this);
00375 }
00376
00377 KCharsets::~KCharsets()
00378 {
00379 delete d;
00380 }
00381
00382 QChar KCharsets::fromEntity(const QString &str)
00383 {
00384 QChar res = QChar::null;
00385
00386 int pos = 0;
00387 if(str[pos] == '&') pos++;
00388
00389
00390 if (str[pos] == '#' && str.length()-pos > 1) {
00391 bool ok;
00392 pos++;
00393 if (str[pos] == 'x' || str[pos] == 'X') {
00394 pos++;
00395
00396 QString tmp(str.unicode()+pos, str.length()-pos);
00397 res = tmp.toInt(&ok, 16);
00398 } else {
00399
00400 QString tmp(str.unicode()+pos, str.length()-pos);
00401 res = tmp.toInt(&ok, 10);
00402 }
00403 return res;
00404 }
00405
00406 const entity *e = kde_findEntity(str.ascii(), str.length());
00407
00408 if(!e)
00409 {
00410
00411 return QChar::null;
00412 }
00413
00414
00415 return QChar(e->code);
00416 }
00417
00418 QChar KCharsets::fromEntity(const QString &str, int &len)
00419 {
00420
00421
00422 len = 8;
00423 while(len > 0)
00424 {
00425 QString tmp = str.left(len);
00426 QChar res = fromEntity(tmp);
00427 if( res != QChar::null ) return res;
00428 len--;
00429 }
00430 return QChar::null;
00431 }
00432
00433
00434 QString KCharsets::toEntity(const QChar &ch)
00435 {
00436 QString ent;
00437 ent.sprintf("�x%x;", ch.unicode());
00438 return ent;
00439 }
00440
00441 QString KCharsets::resolveEntities( const QString &input )
00442 {
00443 QString text = input;
00444 const QChar *p = text.unicode();
00445 const QChar *end = p + text.length();
00446 const QChar *ampersand = 0;
00447 bool scanForSemicolon = false;
00448
00449 for ( ; p < end; ++p ) {
00450 const QChar ch = *p;
00451
00452 if ( ch == '&' ) {
00453 ampersand = p;
00454 scanForSemicolon = true;
00455 continue;
00456 }
00457
00458 if ( ch != ';' || scanForSemicolon == false )
00459 continue;
00460
00461 assert( ampersand );
00462
00463 scanForSemicolon = false;
00464
00465 const QChar *entityBegin = ampersand + 1;
00466
00467 const uint entityLength = p - entityBegin;
00468 if ( entityLength == 0 )
00469 continue;
00470
00471 const QChar entityValue = KCharsets::fromEntity( QConstString( entityBegin, entityLength ).string() );
00472 if ( entityValue.isNull() )
00473 continue;
00474
00475 const uint ampersandPos = ampersand - text.unicode();
00476
00477 text[ (int)ampersandPos ] = entityValue;
00478 text.remove( ampersandPos + 1, entityLength + 1 );
00479 p = text.unicode() + ampersandPos;
00480 end = text.unicode() + text.length();
00481 ampersand = 0;
00482 }
00483
00484 return text;
00485 }
00486
00487 QStringList KCharsets::availableEncodingNames()
00488 {
00489 QStringList available;
00490
00491 const char* const* pos = charsets_for_encoding;
00492 while( *pos != 0 ) {
00493
00494
00495
00496
00497 for( const char* const* charsets = pos + 1;
00498 *charsets != 0;
00499 ++charsets ) {
00500
00501 #ifdef __GNUC__
00502 #warning FIXME?
00503 #endif
00504 if( true ) {
00505
00506 available.append( QString::fromLatin1( *pos ));
00507 break;
00508 }
00509 }
00510 while( *pos != 0 )
00511 ++pos;
00512 ++pos;
00513 }
00514 return available;
00515 }
00516
00517 QString KCharsets::languageForEncoding( const QString &encoding )
00518 {
00519 int lang = kcharsets_array_search< LanguageForEncoding, int >
00520 ( language_for_encoding, encoding.latin1());
00521 return i18n( language_names[lang] );
00522 }
00523
00524 QString KCharsets::encodingForName( const QString &descriptiveName )
00525 {
00526 const int left = descriptiveName.findRev( '(' );
00527
00528 if (left<0)
00529 return descriptiveName.stripWhiteSpace();
00530
00531 QString name(descriptiveName.mid(left+1));
00532
00533 const int right = name.findRev( ')' );
00534
00535 if (right<0)
00536 return name;
00537
00538 return name.left(right).stripWhiteSpace();
00539 }
00540
00541 QStringList KCharsets::descriptiveEncodingNames()
00542 {
00543 QStringList encodings = availableEncodingNames();
00544 QStringList::Iterator it;
00545 for( it = encodings.begin(); it != encodings.end(); ++it ) {
00546 QString lang = KGlobal::charsets()->languageForEncoding( *it );
00547 *it = i18n("Descriptive Encoding Name", "%1 ( %2 )") .arg(lang) .arg(*it);
00548 }
00549 encodings.sort();
00550 return encodings;
00551 }
00552
00553 QTextCodec *KCharsets::codecForName(const QString &n) const
00554 {
00555 bool b;
00556 return codecForName( n, b );
00557 }
00558
00559 QTextCodec *KCharsets::codecForName(const QString &n, bool &ok) const
00560 {
00561 ok = true;
00562
00563 QTextCodec* codec = 0;
00564
00565 if((codec = d->codecForNameDict[n.isEmpty() ? "->locale<-" : n.latin1()]))
00566 return codec;
00567
00568 if (n.isEmpty()) {
00569 codec = KGlobal::locale()->codecForEncoding();
00570 d->codecForNameDict.replace("->locale<-", codec);
00571 return codec;
00572 }
00573
00574 QCString name = n.lower().latin1();
00575 QCString key = name;
00576 if (name.right(8) == "_charset")
00577 name.truncate(name.length()-8);
00578
00579 if (name.isEmpty()) {
00580 ok = false;
00581 return QTextCodec::codecForName("iso8859-1");
00582 }
00583
00584 codec = QTextCodec::codecForName(name);
00585
00586 if(codec) {
00587 d->codecForNameDict.replace(key, codec);
00588 return codec;
00589 }
00590
00591
00592
00593 QCString cname = kcharsets_array_search< Builtin, const char* >( builtin, name.data());
00594
00595 if(!cname.isEmpty())
00596 codec = QTextCodec::codecForName(cname);
00597
00598 if(codec)
00599 {
00600 d->codecForNameDict.replace(key, codec);
00601 return codec;
00602 }
00603
00604
00605 QString dir;
00606 {
00607 KConfigGroupSaver cfgsav( KGlobal::config(), "i18n" );
00608 dir = KGlobal::config()->readPathEntry("i18ndir", QString::fromLatin1("/usr/share/i18n/charmaps"));
00609 dir += "/";
00610 }
00611
00612
00613
00614 cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data());
00615
00616 if(cname.isEmpty())
00617 cname = name;
00618 cname = cname.upper();
00619
00620 codec = QTextCodec::loadCharmapFile((QString)(dir + cname.data()));
00621
00622 if(codec) {
00623 d->codecForNameDict.replace(key, codec);
00624 return codec;
00625 }
00626
00627
00628
00629 cname = cname.lower();
00630 cname = kcharsets_array_search< ConversionHints, const char* >( conversion_hints, (const char*)cname );
00631
00632 if(!cname.isEmpty())
00633 codec = QTextCodec::codecForName(cname);
00634
00635 if(codec) {
00636 d->codecForNameDict.replace(key, codec);
00637 return codec;
00638 }
00639
00640
00641 ok = false;
00642 return QTextCodec::codecForName("iso8859-1");
00643 }