diff options
author | Werner Koch <wk@gnupg.org> | 2006-09-22 20:15:18 +0200 |
---|---|---|
committer | Werner Koch <wk@gnupg.org> | 2006-09-22 20:15:18 +0200 |
commit | 2db8df0ba388f0a39424b76ce00fe5b51abbd54d (patch) | |
tree | 20ec3eea82409a6329e241b92c9c16ef07d07456 /jnlib | |
parent | Improved yat2m. (diff) | |
download | gnupg2-2db8df0ba388f0a39424b76ce00fe5b51abbd54d.tar.xz gnupg2-2db8df0ba388f0a39424b76ce00fe5b51abbd54d.zip |
Added iconv support and doc cleanups.
Diffstat (limited to 'jnlib')
-rw-r--r-- | jnlib/ChangeLog | 6 | ||||
-rw-r--r-- | jnlib/utf8conv.c | 572 |
2 files changed, 383 insertions, 195 deletions
diff --git a/jnlib/ChangeLog b/jnlib/ChangeLog index f601d9b14..6c38de2b1 100644 --- a/jnlib/ChangeLog +++ b/jnlib/ChangeLog @@ -1,3 +1,9 @@ +2006-09-22 Werner Koch <wk@g10code.com> + + * utf8conv.c: Reworked to match the gnupg 1.4.5 code. This now + requires iconv support but this is reasonable for all modern + systems. + 2006-08-29 Werner Koch <wk@g10code.com> * logging.c (do_logv): Emit a missing LF for fatal errors. diff --git a/jnlib/utf8conv.c b/jnlib/utf8conv.c index 9fba1ed4f..ebb6ef3fd 100644 --- a/jnlib/utf8conv.c +++ b/jnlib/utf8conv.c @@ -28,101 +28,225 @@ #ifdef HAVE_LANGINFO_CODESET #include <langinfo.h> #endif +#include <errno.h> +#include <iconv.h> #include "libjnlib-config.h" #include "stringhelp.h" #include "utf8conv.h" +#ifndef MB_LEN_MAX +#define MB_LEN_MAX 16 +#endif + +static const char *active_charset_name = "iso-8859-1"; +static unsigned short *active_charset; +static int no_translation; /* Set to true if we let simply pass through. */ +static int use_iconv; /* iconv conversion functions required. 
*/ -static ushort koi8_unicode[128] = { - 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524, - 0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, - 0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248, - 0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7, - 0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556, - 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e, - 0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565, - 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9, - 0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, - 0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, - 0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, - 0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a, - 0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, - 0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, - 0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, - 0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a -}; - -static ushort latin2_unicode[128] = { - 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, - 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, - 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, - 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, - 0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7, - 0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B, - 0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7, - 0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C, - 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7, - 0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E, - 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7, - 0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF, - 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7, - 
0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F, - 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7, - 0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9 -}; -static const char *active_charset_name = "iso-8859-1"; -static ushort *active_charset = NULL; -static int no_translation = 0; +/* Error handler for iconv failures. This is needed to not clutter the + output with repeated diagnostics about a missing conversion. */ +static void +handle_iconv_error (const char *to, const char *from, int use_fallback) +{ + if (errno == EINVAL) + { + static int shown1, shown2; + int x; + + if (to && !strcmp (to, "utf-8")) + { + x = shown1; + shown1 = 1; + } + else + { + x = shown2; + shown2 = 1; + } + + if (!x) + log_info (_("conversion from `%s' to `%s' not available\n"), + from, to); + } + else + { + static int shown; + + if (!shown) + log_info (_("iconv_open failed: %s\n"), strerror (errno)); + shown = 1; + } + + if (use_fallback) + { + /* To avoid further error messages we fallback to Latin-1 for the + native encoding. This is justified as one can expect that on a + utf-8 enabled system nl_langinfo() will work and thus we won't + never get to here. Thus Latin-1 seems to be a reasonable + default. */ + active_charset_name = "iso-8859-1"; + no_translation = 0; + active_charset = NULL; + use_iconv = 0; + } +} + int set_native_charset (const char *newset) { - if (!newset) + const char *full_newset; + + if (!newset) + { +#ifdef HABE_W32_SYSTEM + static char codepage[30]; + unsigned int cpno; + const char *aliases; + + /* We are a console program thus we need to use the + GetConsoleOutputCP function and not the the GetACP which + would give the codepage for a GUI program. Note this is not + a bulletproof detection because GetConsoleCP might return a + different one for console input. Not sure how to cope with + that. If the console Code page is not known we fall back to + the system code page. 
*/ + cpno = GetConsoleOutputCP (); + if (!cpno) + cpno = GetACP (); + sprintf (codepage, "CP%u", cpno ); + /* Resolve alias. We use a long string string and not the usual + array to optimize if the code is taken to a DSO. Taken from + libiconv 1.9.2. */ + newset = codepage; + for (aliases = ("CP936" "\0" "GBK" "\0" + "CP1361" "\0" "JOHAB" "\0" + "CP20127" "\0" "ASCII" "\0" + "CP20866" "\0" "KOI8-R" "\0" + "CP21866" "\0" "KOI8-RU" "\0" + "CP28591" "\0" "ISO-8859-1" "\0" + "CP28592" "\0" "ISO-8859-2" "\0" + "CP28593" "\0" "ISO-8859-3" "\0" + "CP28594" "\0" "ISO-8859-4" "\0" + "CP28595" "\0" "ISO-8859-5" "\0" + "CP28596" "\0" "ISO-8859-6" "\0" + "CP28597" "\0" "ISO-8859-7" "\0" + "CP28598" "\0" "ISO-8859-8" "\0" + "CP28599" "\0" "ISO-8859-9" "\0" + "CP28605" "\0" "ISO-8859-15" "\0" + "CP65001" "\0" "UTF-8" "\0"); + *aliases; + aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) + { + if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1])) + { + newset = aliases + strlen (aliases) + 1; + break; + } + } + +#else /*!HAVE_W32_SYSTEM*/ + #ifdef HAVE_LANGINFO_CODESET - newset = nl_langinfo (CODESET); -#else - newset = "8859-1"; -#endif + newset = nl_langinfo (CODESET); +#else /*!HAVE_LANGINFO_CODESET*/ + /* Try to get the used charset from environment variables. 
*/ + static char codepage[30]; + const char *lc, *dot, *mod; + + strcpy (codepage, "iso-8859-1"); + lc = getenv ("LC_ALL"); + if (!lc || !*lc) + { + lc = getenv ("LC_CTYPE"); + if (!lc || !*lc) + lc = getenv ("LANG"); + } + if (lc && *lc) + { + dot = strchr (lc, '.'); + if (dot) + { + mod = strchr (++dot, '@'); + if (!mod) + mod = dot + strlen (dot); + if (mod - dot < sizeof codepage && dot != mod) + { + memcpy (codepage, dot, mod - dot); + codepage [mod - dot] = 0; + } + } + } + newset = codepage; +#endif /*!HAVE_LANGINFO_CODESET*/ +#endif /*!HAVE_W32_SYSTEM*/ + } + full_newset = newset; if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3)) { newset += 3; if (*newset == '-' || *newset == '_') - newset++; + newset++; } - if (!*newset - || !ascii_strcasecmp (newset, "8859-1") - || !ascii_strcasecmp (newset, "8859-15")) + /* Note that we silently assume that plain ASCII is actually meant + as Latin-1. This makes sense because many Unix system don't have + their locale set up properly and thus would get annoying error + messages and we have to handle all the "bug" reports. Latin-1 has + always been the character set used for 8 bit characters on Unix + systems. 
*/ + if ( !*newset + || !ascii_strcasecmp (newset, "8859-1" ) + || !ascii_strcasecmp (newset, "646" ) + || !ascii_strcasecmp (newset, "ASCII" ) + || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" ) + ) { active_charset_name = "iso-8859-1"; no_translation = 0; active_charset = NULL; + use_iconv = 0; } - else if (!ascii_strcasecmp (newset, "8859-2")) - { - active_charset_name = "iso-8859-2"; - no_translation = 0; - active_charset = latin2_unicode; - } - else if (!ascii_strcasecmp (newset, "koi8-r")) - { - active_charset_name = "koi8-r"; - no_translation = 0; - active_charset = koi8_unicode; - } - else if (!ascii_strcasecmp (newset, "utf8") - || !ascii_strcasecmp (newset, "utf-8")) + else if ( !ascii_strcasecmp (newset, "utf8" ) + || !ascii_strcasecmp(newset, "utf-8") ) { active_charset_name = "utf-8"; no_translation = 1; active_charset = NULL; + use_iconv = 0; } else - return -1; + { + iconv_t cd; + +#ifdef HAVE_W32_SYSTEM + if (load_libiconv ()) + return -1; +#endif /*HAVE_W32_SYSTEM*/ + + cd = iconv_open (full_newset, "utf-8"); + if (cd == (iconv_t)-1) + { + handle_iconv_error (full_newset, "utf-8", 0); + return -1; + } + iconv_close (cd); + cd = iconv_open ("utf-8", full_newset); + if (cd == (iconv_t)-1) + { + handle_iconv_error ("utf-8", full_newset, 0); + return -1; + } + iconv_close (cd); + active_charset_name = full_newset; + no_translation = 0; + active_charset = NULL; + use_iconv = 1; + } return 0; } @@ -132,10 +256,9 @@ get_native_charset () return active_charset_name; } -/**************** - * Convert string, which is in native encoding to UTF8 and return the - * new allocated UTF8 string. - */ + +/* Convert string, which is in native encoding to UTF8 and return a + new allocated UTF-8 string. */ char * native_to_utf8 (const char *orig_string) { @@ -147,41 +270,12 @@ native_to_utf8 (const char *orig_string) if (no_translation) { + /* Already utf-8 encoded. 
*/ buffer = jnlib_xstrdup (orig_string); } - else if (active_charset) - { - for (s = string; *s; s++) - { - length++; - if (*s & 0x80) - length += 2; /* we may need 3 bytes */ - } - buffer = jnlib_xmalloc (length + 1); - for (p = (unsigned char *)buffer, s = string; *s; s++) - { - if ((*s & 0x80)) - { - ushort val = active_charset[*s & 0x7f]; - if (val < 0x0800) - { - *p++ = 0xc0 | ((val >> 6) & 0x1f); - *p++ = 0x80 | (val & 0x3f); - } - else - { - *p++ = 0xe0 | ((val >> 12) & 0x0f); - *p++ = 0x80 | ((val >> 6) & 0x3f); - *p++ = 0x80 | (val & 0x3f); - } - } - else - *p++ = *s; - } - *p = 0; - } - else + else if (!active_charset && !use_iconv) { + /* For Latin-1 we can avoid the iconv overhead. */ for (s = string; *s; s++) { length++; @@ -191,7 +285,7 @@ native_to_utf8 (const char *orig_string) buffer = jnlib_xmalloc (length + 1); for (p = (unsigned char *)buffer, s = string; *s; s++) { - if (*s & 0x80) + if ( (*s & 0x80 )) { *p++ = 0xc0 | ((*s >> 6) & 3); *p++ = 0x80 | (*s & 0x3f); @@ -201,22 +295,68 @@ native_to_utf8 (const char *orig_string) } *p = 0; } + else + { + /* Need to use iconv. */ + iconv_t cd; + const char *inptr; + char *outptr; + size_t inbytes, outbytes; + + cd = iconv_open ("utf-8", active_charset_name); + if (cd == (iconv_t)-1) + { + handle_iconv_error ("utf-8", active_charset_name, 1); + return native_to_utf8 (string); + } + + for (s=string; *s; s++ ) + { + length++; + if ((*s & 0x80)) + length += 5; /* We may need up to 6 bytes for the utf8 output. */ + } + buffer = jnlib_xmalloc (length + 1); + + inptr = string; + inbytes = strlen (string); + outptr = buffer; + outbytes = length; + if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes, + &outptr, &outbytes) == (size_t)-1) + { + static int shown; + + if (!shown) + log_info (_("conversion from `%s' to `%s' failed: %s\n"), + active_charset_name, "utf-8", strerror (errno)); + shown = 1; + /* We don't do any conversion at all but use the strings as is. 
*/ + strcpy (buffer, string); + } + else /* Success. */ + { + *outptr = 0; + /* We could realloc the buffer now but I doubt that it makes + much sense given that it will get freed anyway soon + after. */ + } + iconv_close (cd); + } return buffer; } -/* Convert string, which is in UTF8 to native encoding. Replace - * illegal encodings by some "\xnn" and quote all control - * characters. A character with value DELIM will always be quoted, it - * must be a vanilla ASCII character. */ -char * -utf8_to_native (const char *string, size_t length, int delim) + +static char * +do_utf8_to_native (const char *string, size_t length, int delim, + int with_iconv) { int nleft; int i; unsigned char encbuf[8]; int encidx; - const byte *s; + const unsigned char *s; size_t n; char *buffer = NULL; char *p = NULL; @@ -224,19 +364,20 @@ utf8_to_native (const char *string, size_t length, int delim) size_t slen; int resync = 0; - /* 1. pass (p==NULL): count the extended utf-8 characters */ - /* 2. pass (p!=NULL): create string */ + /* First pass (p==NULL): count the extended utf-8 characters. */ + /* Second pass (p!=NULL): create string. */ for (;;) { for (slen = length, nleft = encidx = 0, n = 0, - s = (const unsigned char *)string; slen; + s = (const unsigned char *)string; + slen; s++, slen--) { if (resync) { if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd))) { - /* still invalid */ + /* Still invalid. */ if (p) { sprintf (p, "\\x%02x", *s); @@ -250,45 +391,23 @@ utf8_to_native (const char *string, size_t length, int delim) if (!nleft) { if (!(*s & 0x80)) - { /* plain ascii */ - if (*s < 0x20 || *s == 0x7f || *s == delim || - (delim && *s == '\\')) + { + /* Plain ascii. 
*/ + if ( delim != -1 + && (*s < 0x20 || *s == 0x7f || *s == delim + || (delim && *s == '\\'))) { n++; if (p) *p++ = '\\'; switch (*s) { - case '\n': - n++; - if (p) - *p++ = 'n'; - break; - case '\r': - n++; - if (p) - *p++ = 'r'; - break; - case '\f': - n++; - if (p) - *p++ = 'f'; - break; - case '\v': - n++; - if (p) - *p++ = 'v'; - break; - case '\b': - n++; - if (p) - *p++ = 'b'; - break; - case 0: - n++; - if (p) - *p++ = '0'; - break; + case '\n': n++; if ( p ) *p++ = 'n'; break; + case '\r': n++; if ( p ) *p++ = 'r'; break; + case '\f': n++; if ( p ) *p++ = 'f'; break; + case '\v': n++; if ( p ) *p++ = 'v'; break; + case '\b': n++; if ( p ) *p++ = 'b'; break; + case 0: n++; if ( p ) *p++ = '0'; break; default: n += 3; if (p) @@ -306,43 +425,43 @@ utf8_to_native (const char *string, size_t length, int delim) n++; } } - else if ((*s & 0xe0) == 0xc0) - { /* 110x xxxx */ + else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */ + { val = *s & 0x1f; nleft = 1; encidx = 0; encbuf[encidx++] = *s; } - else if ((*s & 0xf0) == 0xe0) - { /* 1110 xxxx */ + else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */ + { val = *s & 0x0f; nleft = 2; encidx = 0; encbuf[encidx++] = *s; } - else if ((*s & 0xf8) == 0xf0) - { /* 1111 0xxx */ + else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */ + { val = *s & 0x07; nleft = 3; encidx = 0; encbuf[encidx++] = *s; } - else if ((*s & 0xfc) == 0xf8) - { /* 1111 10xx */ + else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */ + { val = *s & 0x03; nleft = 4; encidx = 0; encbuf[encidx++] = *s; } - else if ((*s & 0xfe) == 0xfc) - { /* 1111 110x */ + else if ((*s & 0xfe) == 0xfc) /* 1111 110x */ + { val = *s & 0x01; nleft = 5; encidx = 0; encbuf[encidx++] = *s; } - else - { /* invalid encoding: print as \xnn */ + else /* Invalid encoding: print as \xNN. 
*/ + { if (p) { sprintf (p, "\\x%02x", *s); @@ -352,8 +471,8 @@ utf8_to_native (const char *string, size_t length, int delim) resync = 1; } } - else if (*s < 0x80 || *s >= 0xc0) - { /* invalid */ + else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */ + { if (p) { for (i = 0; i < encidx; i++) @@ -374,8 +493,8 @@ utf8_to_native (const char *string, size_t length, int delim) encbuf[encidx++] = *s; val <<= 6; val |= *s & 0x3f; - if (!--nleft) - { /* ready */ + if (!--nleft) /* Ready. */ + { if (no_translation) { if (p) @@ -386,43 +505,41 @@ utf8_to_native (const char *string, size_t length, int delim) n += encidx; encidx = 0; } - else if (active_charset) - { /* table lookup */ - for (i = 0; i < 128; i++) - { - if (active_charset[i] == val) - break; - } - if (i < 128) - { /* we can print this one */ - if (p) - *p++ = i + 128; - n++; - } - else - { /* we do not have a translation: print utf8 */ - if (p) - { - for (i = 0; i < encidx; i++) - { - sprintf (p, "\\x%02x", encbuf[i]); - p += 4; - } - } - n += encidx * 4; - encidx = 0; - } - } - else - { /* native set */ + else if (with_iconv) + { + /* Our strategy for using iconv is a bit strange + but it better keeps compatibility with + previous versions in regard to how invalid + encodings are displayed. What we do is to + keep the utf-8 as is and have the real + translation step then at the end. Yes, I + know that this is ugly. However we are short + of the 1.4 release and for this branch we + should not mess too much around with iconv + things. One reason for this is that we don't + know enough about non-GNU iconv + implementation and want to minimize the risk + of breaking the code on too many platforms. */ + if ( p ) + { + for (i=0; i < encidx; i++ ) + *p++ = encbuf[i]; + } + n += encidx; + encidx = 0; + } + else /* Latin-1 case. 
*/ + { if (val >= 0x80 && val < 256) { - n++; /* we can simply print this character */ + /* We can simply print this character */ + n++; if (p) *p++ = val; } else - { /* we do not have a translation: print utf8 */ + { + /* We do not have a translation: print utf8. */ if (p) { for (i = 0; i < encidx; i++) @@ -440,13 +557,78 @@ utf8_to_native (const char *string, size_t length, int delim) } } if (!buffer) - { /* allocate the buffer after the first pass */ + { + /* Allocate the buffer after the first pass. */ buffer = p = jnlib_xmalloc (n + 1); } - else + else if (with_iconv) + { + /* Note: See above for comments. */ + iconv_t cd; + const char *inptr; + char *outbuf, *outptr; + size_t inbytes, outbytes; + + *p = 0; /* Terminate the buffer. */ + + cd = iconv_open (active_charset_name, "utf-8"); + if (cd == (iconv_t)-1) + { + handle_iconv_error (active_charset_name, "utf-8", 1); + jnlib_free (buffer); + return utf8_to_native (string, length, delim); + } + + /* Allocate a new buffer large enough to hold all possible + encodings. */ + n = p - buffer + 1; + inbytes = n - 1;; + inptr = buffer; + outbytes = n * MB_LEN_MAX; + if (outbytes / MB_LEN_MAX != n) + BUG (); /* Actually an overflow. */ + outbuf = outptr = jnlib_xmalloc (outbytes); + if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes, + &outptr, &outbytes) == (size_t)-1) + { + static int shown; + + if (!shown) + log_info (_("conversion from `%s' to `%s' failed: %s\n"), + "utf-8", active_charset_name, strerror (errno)); + shown = 1; + /* Didn't worked out. Try again but without iconv. */ + jnlib_free (buffer); + buffer = NULL; + jnlib_free (outbuf); + outbuf = do_utf8_to_native (string, length, delim, 0); + } + else /* Success. */ + { + *outptr = 0; /* Make sure it is a string. */ + /* We could realloc the buffer now but I doubt that it + makes much sense given that it will get freed + anyway soon after. */ + jnlib_free (buffer); + } + iconv_close (cd); + return outbuf; + } + else /* Not using iconv. 
*/ { - *p = 0; /* make a string */ + *p = 0; /* Make sure it is a string. */ return buffer; } } } + +/* Convert string, which is in UTF-8 to native encoding. Replace + illegal encodings by some "\xnn" and quote all control + characters. A character with value DELIM will always be quoted, it + must be a vanilla ASCII character. A DELIM value of -1 is special: + it disables all quoting of control characters. */ +char * +utf8_to_native (const char *string, size_t length, int delim) +{ + return do_utf8_to_native (string, length, delim, use_iconv); +} |