Added iconv support and doc cleanups.

author: Werner Koch <wk@gnupg.org> 2006-09-22 20:15:18 +0200
committer: Werner Koch <wk@gnupg.org> 2006-09-22 20:15:18 +0200
commit: 2db8df0ba388f0a39424b76ce00fe5b51abbd54d (patch)
tree: 20ec3eea82409a6329e241b92c9c16ef07d07456 /jnlib
parent: Improved yat2m. (diff)
download: gnupg2-2db8df0ba388f0a39424b76ce00fe5b51abbd54d.tar.xz
gnupg2-2db8df0ba388f0a39424b76ce00fe5b51abbd54d.zip
2 files changed, 383 insertions, 195 deletions
diff --git a/jnlib/ChangeLog b/jnlib/ChangeLog
index f601d9b14..6c38de2b1 100644
--- a/jnlib/ChangeLog
+++ b/jnlib/ChangeLog
@@ -1,3 +1,9 @@
+2006-09-22  Werner Koch  <wk@g10code.com>
+
+	* utf8conv.c: Reworked to match the gnupg 1.4.5 code.  This now
+	requires iconv support but this is reasonable for all modern
+	systems.
+
 2006-08-29  Werner Koch  <wk@g10code.com>
 
 	* logging.c (do_logv): Emit a missing LF for fatal errors.
diff --git a/jnlib/utf8conv.c b/jnlib/utf8conv.c
index 9fba1ed4f..ebb6ef3fd 100644
--- a/jnlib/utf8conv.c
+++ b/jnlib/utf8conv.c
@@ -28,101 +28,225 @@
 #ifdef HAVE_LANGINFO_CODESET
 #include <langinfo.h>
 #endif
+#include <errno.h>
+#include <iconv.h>
 
 #include "libjnlib-config.h"
 #include "stringhelp.h"
 #include "utf8conv.h"
 
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 16
+#endif
+
+static const char *active_charset_name = "iso-8859-1";
+static unsigned short *active_charset;
+static int no_translation;     /* Set to true if we simply pass through. */
+static int use_iconv;          /* iconv conversion functions required. */
 
-static ushort koi8_unicode[128] = {
-  0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
-  0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
-  0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248,
-  0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
-  0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556,
-  0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e,
-  0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565,
-  0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9,
-  0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
-  0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
-  0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
-  0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
-  0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
-  0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
-  0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
-  0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a
-};
-
-static ushort latin2_unicode[128] = {
-  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
-  0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
-  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
-  0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
-  0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
-  0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
-  0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
-  0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
-  0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
-  0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
-  0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
-  0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
-  0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
-  0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
-  0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
-  0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9
-};
 
 
-static const char *active_charset_name = "iso-8859-1";
-static ushort *active_charset = NULL;
-static int no_translation = 0;
+/* Report a failed iconv_open (errno is examined) at most once per kind
+   of failure; if USE_FALLBACK is set, reset the charset to Latin-1.  */
+static void
+handle_iconv_error (const char *to, const char *from, int use_fallback)
+{
+  if (errno == EINVAL)
+    {
+      static int shown1, shown2;  /* One-shot flags, see below.  */
+      int x;
+
+      if (to && !strcmp (to, "utf-8"))
+        {
+          x = shown1;  /* Target is utf-8.  */
+          shown1 = 1;
+        }
+      else
+        {
+          x = shown2;  /* Any other target.  */
+          shown2 = 1;
+        }
+
+      if (!x)
+        log_info (_("conversion from `%s' to `%s' not available\n"),
+                  from, to);
+    }
+  else
+    {
+      static int shown;  /* One-shot flag for all other errno values.  */
+
+      if (!shown)
+        log_info (_("iconv_open failed: %s\n"), strerror (errno));
+      shown = 1;
+    }
+
+  if (use_fallback)
+    {
+      /* To avoid further error messages we fall back to Latin-1 for the
+         native encoding.  This is justified as one can expect that on a
+         utf-8 enabled system nl_langinfo() will work and thus we will
+         never get here.  Thus Latin-1 seems to be a reasonable
+         default.  */
+      active_charset_name = "iso-8859-1";
+      no_translation = 0;
+      active_charset = NULL;
+      use_iconv = 0;
+    }
+}
+
 
 int
 set_native_charset (const char *newset)
 {
-  if (!newset)
+  const char *full_newset;
+
+  if (!newset) 
+    {
+#ifdef HABE_W32_SYSTEM
+      static char codepage[30];
+      unsigned int cpno;
+      const char *aliases;
+      
+      /* We are a console program thus we need to use the
+         GetConsoleOutputCP function and not the the GetACP which
+         would give the codepage for a GUI program.  Note this is not
+         a bulletproof detection because GetConsoleCP might return a
+         different one for console input.  Not sure how to cope with
+         that.  If the console Code page is not known we fall back to
+         the system code page.  */
+      cpno = GetConsoleOutputCP ();
+      if (!cpno)
+        cpno = GetACP ();
+      sprintf (codepage, "CP%u", cpno );
+      /* Resolve alias.  We use a long string string and not the usual
+         array to optimize if the code is taken to a DSO.  Taken from
+         libiconv 1.9.2. */
+      newset = codepage;
+      for (aliases = ("CP936"   "\0" "GBK" "\0"
+                      "CP1361"  "\0" "JOHAB" "\0"
+                      "CP20127" "\0" "ASCII" "\0"
+                      "CP20866" "\0" "KOI8-R" "\0"
+                      "CP21866" "\0" "KOI8-RU" "\0"
+                      "CP28591" "\0" "ISO-8859-1" "\0"
+                      "CP28592" "\0" "ISO-8859-2" "\0"
+                      "CP28593" "\0" "ISO-8859-3" "\0"
+                      "CP28594" "\0" "ISO-8859-4" "\0"
+                      "CP28595" "\0" "ISO-8859-5" "\0"
+                      "CP28596" "\0" "ISO-8859-6" "\0"
+                      "CP28597" "\0" "ISO-8859-7" "\0"
+                      "CP28598" "\0" "ISO-8859-8" "\0"
+                      "CP28599" "\0" "ISO-8859-9" "\0"
+                      "CP28605" "\0" "ISO-8859-15" "\0"
+                      "CP65001" "\0" "UTF-8" "\0");
+           *aliases;
+           aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
+        {
+          if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1]))
+            {
+              newset = aliases + strlen (aliases) + 1;
+              break;
+            }
+        }
+
+#else /*!HAVE_W32_SYSTEM*/
+      
 #ifdef HAVE_LANGINFO_CODESET
-    newset = nl_langinfo (CODESET);
-#else
-    newset = "8859-1";
-#endif
+      newset = nl_langinfo (CODESET);
+#else /*!HAVE_LANGINFO_CODESET*/
+      /* Try to get the used charset from environment variables.  */
+      static char codepage[30];
+      const char *lc, *dot, *mod;
+
+      strcpy (codepage, "iso-8859-1");
+      lc = getenv ("LC_ALL");
+      if (!lc || !*lc)
+        {
+          lc = getenv ("LC_CTYPE");
+          if (!lc || !*lc)
+            lc = getenv ("LANG");
+        }
+      if (lc && *lc)
+        {
+          dot = strchr (lc, '.');
+          if (dot)
+            {
+              mod = strchr (++dot, '@');
+              if (!mod)
+                mod = dot + strlen (dot);
+              if (mod - dot < sizeof codepage && dot != mod) 
+                {
+                  memcpy (codepage, dot, mod - dot);
+                  codepage [mod - dot] = 0;
+                }
+            }
+        }
+      newset = codepage;
+#endif /*!HAVE_LANGINFO_CODESET*/
+#endif /*!HAVE_W32_SYSTEM*/
+    }
 
+  full_newset = newset;
   if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
     {
       newset += 3;
       if (*newset == '-' || *newset == '_')
-	newset++;
+        newset++;
     }
 
-  if (!*newset
-      || !ascii_strcasecmp (newset, "8859-1")
-      || !ascii_strcasecmp (newset, "8859-15"))
+  /* Note that we silently assume that plain ASCII is actually meant
+     as Latin-1.  This makes sense because many Unix system don't have
+     their locale set up properly and thus would get annoying error
+     messages and we have to handle all the "bug" reports. Latin-1 has
+     always been the character set used for 8 bit characters on Unix
+     systems. */
+  if ( !*newset
+       || !ascii_strcasecmp (newset, "8859-1" )
+       || !ascii_strcasecmp (newset, "646" )
+       || !ascii_strcasecmp (newset, "ASCII" )
+       || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" )
+       )
     {
       active_charset_name = "iso-8859-1";
       no_translation = 0;
       active_charset = NULL;
+      use_iconv = 0;
     }
-  else if (!ascii_strcasecmp (newset, "8859-2"))
-    {
-      active_charset_name = "iso-8859-2";
-      no_translation = 0;
-      active_charset = latin2_unicode;
-    }
-  else if (!ascii_strcasecmp (newset, "koi8-r"))
-    {
-      active_charset_name = "koi8-r";
-      no_translation = 0;
-      active_charset = koi8_unicode;
-    }
-  else if (!ascii_strcasecmp (newset, "utf8")
-	   || !ascii_strcasecmp (newset, "utf-8"))
+  else if ( !ascii_strcasecmp (newset, "utf8" )
+            || !ascii_strcasecmp(newset, "utf-8") )
     {
       active_charset_name = "utf-8";
       no_translation = 1;
       active_charset = NULL;
+      use_iconv = 0;
     }
   else
-    return -1;
+    {
+      iconv_t cd;
+      
+#ifdef HAVE_W32_SYSTEM
+      if (load_libiconv ())
+        return -1;
+#endif /*HAVE_W32_SYSTEM*/      
+
+      cd = iconv_open (full_newset, "utf-8");
+      if (cd == (iconv_t)-1) 
+        {
+          handle_iconv_error (full_newset, "utf-8", 0);
+          return -1;
+        }
+      iconv_close (cd);
+      cd = iconv_open ("utf-8", full_newset);
+      if (cd == (iconv_t)-1) 
+        {
+          handle_iconv_error ("utf-8", full_newset, 0);
+          return -1;
+        }
+      iconv_close (cd);
+      active_charset_name = full_newset;
+      no_translation = 0;
+      active_charset = NULL; 
+      use_iconv = 1;
+    }
   return 0;
 }
 
@@ -132,10 +256,9 @@ get_native_charset ()
   return active_charset_name;
 }
 
-/****************
- * Convert string, which is in native encoding to UTF8 and return the
- * new allocated UTF8 string.
- */
+
+/* Convert string, which is in native encoding, to UTF-8 and return a
+   newly allocated UTF-8 string.  */
 char *
 native_to_utf8 (const char *orig_string)
 {
@@ -147,41 +270,12 @@ native_to_utf8 (const char *orig_string)
 
   if (no_translation)
     {
+      /* Already utf-8 encoded. */
       buffer = jnlib_xstrdup (orig_string);
     }
-  else if (active_charset)
-    {
-      for (s = string; *s; s++)
-	{
-	  length++;
-	  if (*s & 0x80)
-	    length += 2;	/* we may need 3 bytes */
-	}
-      buffer = jnlib_xmalloc (length + 1);
-      for (p = (unsigned char *)buffer, s = string; *s; s++)
-	{
-	  if ((*s & 0x80))
-	    {
-	      ushort val = active_charset[*s & 0x7f];
-	      if (val < 0x0800)
-		{
-		  *p++ = 0xc0 | ((val >> 6) & 0x1f);
-		  *p++ = 0x80 | (val & 0x3f);
-		}
-	      else
-		{
-		  *p++ = 0xe0 | ((val >> 12) & 0x0f);
-		  *p++ = 0x80 | ((val >> 6) & 0x3f);
-		  *p++ = 0x80 | (val & 0x3f);
-		}
-	    }
-	  else
-	    *p++ = *s;
-	}
-      *p = 0;
-    }
-  else
+  else if (!active_charset && !use_iconv)
     {
+      /* For Latin-1 we can avoid the iconv overhead. */
       for (s = string; *s; s++)
 	{
 	  length++;
@@ -191,7 +285,7 @@ native_to_utf8 (const char *orig_string)
       buffer = jnlib_xmalloc (length + 1);
       for (p = (unsigned char *)buffer, s = string; *s; s++)
 	{
-	  if (*s & 0x80)
+	  if ( (*s & 0x80 ))
 	    {
 	      *p++ = 0xc0 | ((*s >> 6) & 3);
 	      *p++ = 0x80 | (*s & 0x3f);
@@ -201,22 +295,68 @@ native_to_utf8 (const char *orig_string)
 	}
       *p = 0;
     }
+  else
+    { 
+      /* Need to use iconv.  */
+      iconv_t cd;
+      const char *inptr;
+      char *outptr;
+      size_t inbytes, outbytes;
+     
+      cd = iconv_open ("utf-8", active_charset_name);
+      if (cd == (iconv_t)-1)
+        {
+          handle_iconv_error ("utf-8", active_charset_name, 1);
+          return native_to_utf8 (string);
+        }
+
+      for (s=string; *s; s++ ) 
+        {
+          length++;
+          if ((*s & 0x80))
+            length += 5; /* We may need up to 6 bytes for the utf8 output. */
+        }
+      buffer = jnlib_xmalloc (length + 1);
+      
+      inptr = string;
+      inbytes = strlen (string);
+      outptr = buffer;
+      outbytes = length;
+      if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
+                  &outptr, &outbytes) == (size_t)-1)
+        {
+          static int shown;
+
+          if (!shown)
+            log_info (_("conversion from `%s' to `%s' failed: %s\n"),
+                      active_charset_name, "utf-8", strerror (errno));
+          shown = 1;
+          /* We don't do any conversion at all but use the strings as is. */
+          strcpy (buffer, string);
+        }
+      else /* Success.  */
+        {
+          *outptr = 0;
+          /* We could realloc the buffer now but I doubt that it makes
+             much sense given that it will get freed anyway soon
+             after.  */
+        }
+      iconv_close (cd);
+    }
   return buffer;
 }
 
 
-/* Convert string, which is in UTF8 to native encoding.  Replace
- * illegal encodings by some "\xnn" and quote all control
- * characters. A character with value DELIM will always be quoted, it
- * must be a vanilla ASCII character.  */
-char *
-utf8_to_native (const char *string, size_t length, int delim)
+
+static char *
+do_utf8_to_native (const char *string, size_t length, int delim,
+                   int with_iconv)
 {
   int nleft;
   int i;
   unsigned char encbuf[8];
   int encidx;
-  const byte *s;
+  const unsigned char *s;
   size_t n;
   char *buffer = NULL;
   char *p = NULL;
@@ -224,19 +364,20 @@ utf8_to_native (const char *string, size_t length, int delim)
   size_t slen;
   int resync = 0;
 
-  /* 1. pass (p==NULL): count the extended utf-8 characters */
-  /* 2. pass (p!=NULL): create string */
+  /* First pass (p==NULL): count the extended utf-8 characters.  */
+  /* Second pass (p!=NULL): create string.  */
   for (;;)
     {
       for (slen = length, nleft = encidx = 0, n = 0,
-             s = (const unsigned char *)string; slen;
+             s = (const unsigned char *)string;
+           slen;
 	   s++, slen--)
 	{
 	  if (resync)
 	    {
 	      if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
 		{
-		  /* still invalid */
+		  /* Still invalid. */
 		  if (p)
 		    {
 		      sprintf (p, "\\x%02x", *s);
@@ -250,45 +391,23 @@ utf8_to_native (const char *string, size_t length, int delim)
 	  if (!nleft)
 	    {
 	      if (!(*s & 0x80))
-		{		/* plain ascii */
-		  if (*s < 0x20 || *s == 0x7f || *s == delim ||
-		      (delim && *s == '\\'))
+		{	
+                  /* Plain ascii. */
+		  if ( delim != -1
+                       && (*s < 0x20 || *s == 0x7f || *s == delim 
+                           || (delim && *s == '\\')))
 		    {
 		      n++;
 		      if (p)
 			*p++ = '\\';
 		      switch (*s)
 			{
-			case '\n':
-			  n++;
-			  if (p)
-			    *p++ = 'n';
-			  break;
-			case '\r':
-			  n++;
-			  if (p)
-			    *p++ = 'r';
-			  break;
-			case '\f':
-			  n++;
-			  if (p)
-			    *p++ = 'f';
-			  break;
-			case '\v':
-			  n++;
-			  if (p)
-			    *p++ = 'v';
-			  break;
-			case '\b':
-			  n++;
-			  if (p)
-			    *p++ = 'b';
-			  break;
-			case 0:
-			  n++;
-			  if (p)
-			    *p++ = '0';
-			  break;
+                        case '\n': n++; if ( p ) *p++ = 'n'; break;
+                        case '\r': n++; if ( p ) *p++ = 'r'; break;
+                        case '\f': n++; if ( p ) *p++ = 'f'; break;
+                        case '\v': n++; if ( p ) *p++ = 'v'; break;
+                        case '\b': n++; if ( p ) *p++ = 'b'; break;
+                        case    0: n++; if ( p ) *p++ = '0'; break;
 			default:
 			  n += 3;
 			  if (p)
@@ -306,43 +425,43 @@ utf8_to_native (const char *string, size_t length, int delim)
 		      n++;
 		    }
 		}
-	      else if ((*s & 0xe0) == 0xc0)
-		{		/* 110x xxxx */
+	      else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */
+		{
 		  val = *s & 0x1f;
 		  nleft = 1;
 		  encidx = 0;
 		  encbuf[encidx++] = *s;
 		}
-	      else if ((*s & 0xf0) == 0xe0)
-		{		/* 1110 xxxx */
+	      else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */
+		{	
 		  val = *s & 0x0f;
 		  nleft = 2;
 		  encidx = 0;
 		  encbuf[encidx++] = *s;
 		}
-	      else if ((*s & 0xf8) == 0xf0)
-		{		/* 1111 0xxx */
+	      else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */
+		{	
 		  val = *s & 0x07;
 		  nleft = 3;
 		  encidx = 0;
 		  encbuf[encidx++] = *s;
 		}
-	      else if ((*s & 0xfc) == 0xf8)
-		{		/* 1111 10xx */
+	      else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */
+		{	
 		  val = *s & 0x03;
 		  nleft = 4;
 		  encidx = 0;
 		  encbuf[encidx++] = *s;
 		}
-	      else if ((*s & 0xfe) == 0xfc)
-		{		/* 1111 110x */
+	      else if ((*s & 0xfe) == 0xfc) /* 1111 110x */
+		{		
 		  val = *s & 0x01;
 		  nleft = 5;
 		  encidx = 0;
 		  encbuf[encidx++] = *s;
 		}
-	      else
-		{		/* invalid encoding: print as \xnn */
+	      else /* Invalid encoding: print as \xNN. */
+		{		
 		  if (p)
 		    {
 		      sprintf (p, "\\x%02x", *s);
@@ -352,8 +471,8 @@ utf8_to_native (const char *string, size_t length, int delim)
 		  resync = 1;
 		}
 	    }
-	  else if (*s < 0x80 || *s >= 0xc0)
-	    {			/* invalid */
+	  else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */
+	    {
 	      if (p)
 		{
 		  for (i = 0; i < encidx; i++)
@@ -374,8 +493,8 @@ utf8_to_native (const char *string, size_t length, int delim)
 	      encbuf[encidx++] = *s;
 	      val <<= 6;
 	      val |= *s & 0x3f;
-	      if (!--nleft)
-		{		/* ready */
+	      if (!--nleft)  /* Ready. */
+		{ 
 		  if (no_translation)
 		    {
 		      if (p)
@@ -386,43 +505,41 @@ utf8_to_native (const char *string, size_t length, int delim)
 		      n += encidx;
 		      encidx = 0;
 		    }
-		  else if (active_charset)
-		    {		/* table lookup */
-		      for (i = 0; i < 128; i++)
-			{
-			  if (active_charset[i] == val)
-			    break;
-			}
-		      if (i < 128)
-			{	/* we can print this one */
-			  if (p)
-			    *p++ = i + 128;
-			  n++;
-			}
-		      else
-			{	/* we do not have a translation: print utf8 */
-			  if (p)
-			    {
-			      for (i = 0; i < encidx; i++)
-				{
-				  sprintf (p, "\\x%02x", encbuf[i]);
-				  p += 4;
-				}
-			    }
-			  n += encidx * 4;
-			  encidx = 0;
-			}
-		    }
-		  else
-		    {		/* native set */
+                  else if (with_iconv)
+                    {
+                      /* Our strategy for using iconv is a bit strange
+                         but it better keeps compatibility with
+                         previous versions in regard to how invalid
+                         encodings are displayed.  What we do is to
+                         keep the utf-8 as is and have the real
+                         translation step then at the end.  Yes, I
+                         know that this is ugly.  However we are short
+                         of the 1.4 release and for this branch we
+                         should not mess too much around with iconv
+                         things.  One reason for this is that we don't
+                         know enough about non-GNU iconv
+                         implementation and want to minimize the risk
+                         of breaking the code on too many platforms.  */
+                        if ( p )
+                          {
+                            for (i=0; i < encidx; i++ )
+                              *p++ = encbuf[i];
+                          }
+                        n += encidx;
+                        encidx = 0;
+                    }
+		  else 	/* Latin-1 case. */
+                    {
 		      if (val >= 0x80 && val < 256)
 			{
-			  n++;	/* we can simply print this character */
+                          /* We can simply print this character */
+			  n++;	
 			  if (p)
 			    *p++ = val;
 			}
 		      else
-			{	/* we do not have a translation: print utf8 */
+			{	
+                          /* We do not have a translation: print utf8. */
 			  if (p)
 			    {
 			      for (i = 0; i < encidx; i++)
@@ -440,13 +557,78 @@ utf8_to_native (const char *string, size_t length, int delim)
 	    }
 	}
       if (!buffer)
-	{			/* allocate the buffer after the first pass */
+	{
+          /* Allocate the buffer after the first pass. */
 	  buffer = p = jnlib_xmalloc (n + 1);
 	}
-      else
+      else if (with_iconv)
+        {
+          /* Note: See above for comments.  */
+          iconv_t cd;
+          const char *inptr;
+          char *outbuf, *outptr;
+          size_t inbytes, outbytes;
+          
+          *p = 0;  /* Terminate the buffer. */
+
+          cd = iconv_open (active_charset_name, "utf-8");
+          if (cd == (iconv_t)-1)
+            {
+              handle_iconv_error (active_charset_name, "utf-8", 1);
+              jnlib_free (buffer);
+              return utf8_to_native (string, length, delim);
+            }
+
+          /* Allocate a new buffer large enough to hold all possible
+             encodings. */
+          n = p - buffer + 1;
+          inbytes = n - 1;;
+          inptr = buffer;
+          outbytes = n * MB_LEN_MAX;
+          if (outbytes / MB_LEN_MAX != n) 
+            BUG (); /* Actually an overflow. */
+          outbuf = outptr = jnlib_xmalloc (outbytes);
+          if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
+                      &outptr, &outbytes) == (size_t)-1) 
+            {
+              static int shown;
+              
+              if (!shown)
+                log_info (_("conversion from `%s' to `%s' failed: %s\n"),
+                          "utf-8", active_charset_name, strerror (errno));
+              shown = 1;
+              /* That did not work out.  Try again but without iconv.  */
+              jnlib_free (buffer);
+              buffer = NULL;
+              jnlib_free (outbuf);
+              outbuf = do_utf8_to_native (string, length, delim, 0);
+            }
+            else /* Success.  */
+              { 
+                *outptr = 0; /* Make sure it is a string. */
+                /* We could realloc the buffer now but I doubt that it
+                   makes much sense given that it will get freed
+                   anyway soon after.  */
+                jnlib_free (buffer);
+              }
+          iconv_close (cd);
+          return outbuf;
+        }
+      else /* Not using iconv. */
 	{
-	  *p = 0;		/* make a string */
+	  *p = 0; /* Make sure it is a string. */
 	  return buffer;
 	}
     }
 }
+
+/* Convert STRING of LENGTH bytes, which is in UTF-8, to the native
+   encoding.  Illegal encodings are replaced by "\xnn" and control
+   characters are quoted.  A character with value DELIM is always
+   quoted; it must be a vanilla ASCII character.  A DELIM of -1 is
+   special: it disables all quoting of control characters.  */
+char *
+utf8_to_native (const char *string, size_t length, int delim)
+{
+  return do_utf8_to_native (string, length, delim, use_iconv);
+}
author	Werner Koch <wk@gnupg.org>	2006-09-22 20:15:18 +0200
committer	Werner Koch <wk@gnupg.org>	2006-09-22 20:15:18 +0200
commit	2db8df0ba388f0a39424b76ce00fe5b51abbd54d (patch)
tree	20ec3eea82409a6329e241b92c9c16ef07d07456 /jnlib
parent	Improved yat2m. (diff)
download	gnupg2-2db8df0ba388f0a39424b76ce00fe5b51abbd54d.tar.xz gnupg2-2db8df0ba388f0a39424b76ce00fe5b51abbd54d.zip