1 files changed, 329 insertions, 39 deletions
diff --git a/srclib/pcre/doc/pcre.3 b/srclib/pcre/doc/pcre.3
index 4334be2440..738f76b4a9 100644
--- a/srclib/pcre/doc/pcre.3
+++ b/srclib/pcre/doc/pcre.3
@@ -44,6 +44,12 @@ pcre - Perl-compatible regular expressions.
 .B int *\fIovector\fR, int \fIstringcount\fR, "const char ***\fIlistptr\fR);"
 .PP
 .br
+.B void pcre_free_substring(const char *\fIstringptr\fR);
+.PP
+.br
+.B void pcre_free_substring_list(const char **\fIstringptr\fR);
+.PP
+.br
 .B const unsigned char *pcre_maketables(void);
 .PP
 .br
@@ -70,7 +76,9 @@ pcre - Perl-compatible regular expressions.
 The PCRE library is a set of functions that implement regular expression
 pattern matching using the same syntax and semantics as Perl 5, with just a few
 differences (see below). The current implementation corresponds to Perl 5.005,
-with some additional features from the Perl development release.
+with some additional features from later versions. This includes some
+experimental, incomplete support for UTF-8 encoded strings. Details of exactly
+what is and what is not supported are given below.
 
 PCRE has its own native API, which is described in this document. There is also
 a set of wrapper functions that correspond to the POSIX regular expression API.
@@ -84,12 +92,18 @@ contain the major and minor release numbers for the library. Applications can
 use these to include support for different releases.
 
 The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR
-are used for compiling and matching regular expressions, while
-\fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and
+are used for compiling and matching regular expressions. A sample program that
+demonstrates the simplest way of using them is given in the file
+\fIpcredemo.c\fR. The last section of this man page describes how to run it.
+
+The functions \fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and
 \fBpcre_get_substring_list()\fR are convenience functions for extracting
-captured substrings from a matched subject string. The function
-\fBpcre_maketables()\fR is used (optionally) to build a set of character tables
-in the current locale for passing to \fBpcre_compile()\fR.
+captured substrings from a matched subject string; \fBpcre_free_substring()\fR
+and \fBpcre_free_substring_list()\fR are also provided, to free the memory used
+for extracted strings.
+
+The function \fBpcre_maketables()\fR is used (optionally) to build a set of
+character tables in the current locale for passing to \fBpcre_compile()\fR.
 
 The function \fBpcre_fullinfo()\fR is used to find out information about a
 compiled pattern; \fBpcre_info()\fR is an obsolete version which returns only
@@ -117,18 +131,22 @@ the same compiled pattern can safely be used by several threads at once.
 The function \fBpcre_compile()\fR is called to compile a pattern into an
 internal form. The pattern is a C string terminated by a binary zero, and
 is passed in the argument \fIpattern\fR. A pointer to a single block of memory
-that is obtained via \fBpcre_malloc\fR is returned. This contains the
-compiled code and related data. The \fBpcre\fR type is defined for this for
-convenience, but in fact \fBpcre\fR is just a typedef for \fBvoid\fR, since the
-contents of the block are not externally defined. It is up to the caller to
-free the memory when it is no longer required.
-.PP
+that is obtained via \fBpcre_malloc\fR is returned. This contains the compiled
+code and related data. The \fBpcre\fR type is defined for the returned block;
+this is a typedef for a structure whose contents are not externally defined. It
+is up to the caller to free the memory when it is no longer required.
+
+Although the compiled code of a PCRE regex is relocatable, that is, it does not
+depend on memory location, the complete \fBpcre\fR data block is not
+fully relocatable, because it contains a copy of the \fItableptr\fR argument,
+which is an address (see below).
+
 The size of a compiled pattern is roughly proportional to the length of the
 pattern string, except that each character class (other than those containing
 just a single character, negated or not) requires 33 bytes, and repeat
 quantifiers with a minimum greater than one or a bounded maximum cause the
 relevant portions of the compiled pattern to be replicated.
-.PP
+
 The \fIoptions\fR argument contains independent bits that affect the
 compilation. It should be zero if no options are required. Some of the options,
 in particular, those that are compatible with Perl, can also be set and unset
@@ -137,19 +155,31 @@ below). For these options, the contents of the \fIoptions\fR argument specifies
 their initial settings at the start of compilation and execution. The
 PCRE_ANCHORED option can be set at the time of matching as well as at compile
 time.
-.PP
+
 If \fIerrptr\fR is NULL, \fBpcre_compile()\fR returns NULL immediately.
 Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fR returns
 NULL, and sets the variable pointed to by \fIerrptr\fR to point to a textual
 error message. The offset from the start of the pattern to the character where
 the error was discovered is placed in the variable pointed to by
 \fIerroffset\fR, which must not be NULL. If it is, an immediate error is given.
-.PP
+
 If the final argument, \fItableptr\fR, is NULL, PCRE uses a default set of
 character tables which are built when it is compiled, using the default C
 locale. Otherwise, \fItableptr\fR must be the result of a call to
 \fBpcre_maketables()\fR. See the section on locale support below.
-.PP
+
+This code fragment shows a typical straightforward call to \fBpcre_compile()\fR:
+
+  pcre *re;
+  const char *error;
+  int erroffset;
+  re = pcre_compile(
+    "^A.*Z",          /* the pattern */
+    0,                /* default options */
+    &error,           /* for error message */
+    &erroffset,       /* for error offset */
+    NULL);            /* use default character tables */
+
 The following option bits are defined in the header file:
 
   PCRE_ANCHORED
@@ -223,15 +253,23 @@ This option inverts the "greediness" of the quantifiers so that they are not
 greedy by default, but become greedy if followed by "?". It is not compatible
 with Perl. It can also be set by a (?U) option setting within the pattern.
 
+  PCRE_UTF8
+
+This option causes PCRE to regard both the pattern and the subject as strings
+of UTF-8 characters instead of just byte strings. However, it is available only
+if PCRE has been built to include UTF-8 support. If not, the use of this option
+provokes an error. Support for UTF-8 is new, experimental, and incomplete.
+Details of exactly what it entails are given below.
+
 
 .SH STUDYING A PATTERN
 When a pattern is going to be used several times, it is worth spending more
 time analyzing it in order to speed up the time taken for matching. The
 function \fBpcre_study()\fR takes a pointer to a compiled pattern as its first
-argument, and returns a pointer to a \fBpcre_extra\fR block (another \fBvoid\fR
-typedef) containing additional information about the pattern; this can be
-passed to \fBpcre_exec()\fR. If no additional information is available, NULL
-is returned.
+argument, and returns a pointer to a \fBpcre_extra\fR block (another typedef
+for a structure with hidden contents) containing additional information about
+the pattern; this can be passed to \fBpcre_exec()\fR. If no additional
+information is available, NULL is returned.
 
 The second argument contains option bits. At present, no options are defined
 for \fBpcre_study()\fR, and this argument should always be zero.
@@ -240,6 +278,14 @@ The third argument for \fBpcre_study()\fR is a pointer to an error message. If
 studying succeeds (even if no data is returned), the variable it points to is
 set to NULL. Otherwise it points to a textual error message.
 
+This is a typical call to \fBpcre_study()\fR:
+
+  pcre_extra *pe;
+  pe = pcre_study(
+    re,             /* result of pcre_compile() */
+    0,              /* no options exist */
+    &error);        /* set to NULL or points to a message */
+
 At present, studying a pattern is useful only for non-anchored patterns that do
 not have a single fixed starting character. A bitmap of possible starting
 characters is created.
@@ -289,13 +335,24 @@ the following negative numbers:
   PCRE_ERROR_BADMAGIC   the "magic number" was not found
   PCRE_ERROR_BADOPTION  the value of \fIwhat\fR was invalid
 
+Here is a typical call of \fBpcre_fullinfo()\fR, to obtain the length of the
+compiled pattern:
+
+  int rc;
+  unsigned long int length;
+  rc = pcre_fullinfo(
+    re,               /* result of pcre_compile() */
+    pe,               /* result of pcre_study(), or NULL */
+    PCRE_INFO_SIZE,   /* what is required */
+    &length);         /* where to put the data */
+
 The possible values for the third argument are defined in \fBpcre.h\fR, and are
 as follows:
 
   PCRE_INFO_OPTIONS
 
 Return a copy of the options with which the pattern was compiled. The fourth
-argument should point to au \fBunsigned long int\fR variable. These option bits
+argument should point to an \fBunsigned long int\fR variable. These option bits
 are those specified in the call to \fBpcre_compile()\fR, modified by any
 top-level option settings within the pattern itself, and with the PCRE_ANCHORED
 bit forcibly set if the form of the pattern implies that it can match only at
@@ -376,6 +433,20 @@ pre-compiled pattern, which is passed in the \fIcode\fR argument. If the
 pattern has been studied, the result of the study should be passed in the
 \fIextra\fR argument. Otherwise this must be NULL.
 
+Here is an example of a simple call to \fBpcre_exec()\fR:
+
+  int rc;
+  int ovector[30];
+  rc = pcre_exec(
+    re,             /* result of pcre_compile() */
+    NULL,           /* we didn't study the pattern */
+    "some string",  /* the subject string */
+    11,             /* the length of the subject string */
+    0,              /* start at offset 0 in the subject */
+    0,              /* default options */
+    ovector,        /* vector for substring information */
+    30);            /* number of elements in the vector */
+
 The PCRE_ANCHORED option can be passed in the \fIoptions\fR argument, whose
 unused bits must be zero. However, if a pattern was compiled with
 PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it
@@ -417,9 +488,9 @@ below) and trying an ordinary match again.
 
 The subject string is passed as a pointer in \fIsubject\fR, a length in
 \fIlength\fR, and a starting offset in \fIstartoffset\fR. Unlike the pattern
-string, it may contain binary zero characters. When the starting offset is
-zero, the search for a match starts at the beginning of the subject, and this
-is by far the most common case.
+string, the subject may contain binary zero characters. When the starting
+offset is zero, the search for a match starts at the beginning of the subject,
+and this is by far the most common case.
 
 A non-zero starting offset is useful when searching for another match in the
 same subject by calling \fBpcre_exec()\fR again after a previous success.
@@ -558,7 +629,7 @@ extract a single substring, whose number is given as \fIstringnumber\fR. A
 value of zero extracts the substring that matched the entire pattern, while
 higher values extract the captured substrings. For \fBpcre_copy_substring()\fR,
 the string is placed in \fIbuffer\fR, whose length is given by
-\fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of store is
+\fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of memory is
 obtained via \fBpcre_malloc\fR, and its address is returned via
 \fIstringptr\fR. The yield of the function is the length of the string, not
 including the terminating zero, or one of
@@ -590,6 +661,15 @@ string. This can be distinguished from a genuine zero-length substring by
 inspecting the appropriate offset in \fIovector\fR, which is negative for unset
 substrings.
 
+The two convenience functions \fBpcre_free_substring()\fR and
+\fBpcre_free_substring_list()\fR can be used to free the memory returned by
+a previous call of \fBpcre_get_substring()\fR or
+\fBpcre_get_substring_list()\fR, respectively. They do nothing more than call
+the function pointed to by \fBpcre_free\fR, which of course could be called
+directly from a C program. However, PCRE is used in some situations where it is
+linked via a special interface to another programming language which cannot use
+\fBpcre_free\fR directly; it is for these cases that the functions are
+provided.
 
 
 .SH LIMITATIONS
@@ -597,8 +677,9 @@ There are some size limitations in PCRE but it is hoped that they will never in
 practice be relevant.
 The maximum length of a compiled pattern is 65539 (sic) bytes.
 All values in repeating quantifiers must be less than 65536.
-The maximum number of capturing subpatterns is 99.
-The maximum number of all parenthesized subpatterns, including capturing
+The maximum number of capturing subpatterns is 65535.
+There is no limit to the number of non-capturing subpatterns, but the maximum
+depth of nesting of all kinds of parenthesized subpattern, including capturing
 subpatterns, assertions, and other types of subpattern, is 200.
 
 The maximum length of a subject string is the largest positive number that an
@@ -691,8 +772,14 @@ The syntax and semantics of the regular expressions supported by PCRE are
 described below. Regular expressions are also described in the Perl
 documentation and in a number of other books, some of which have copious
 examples. Jeffrey Friedl's "Mastering Regular Expressions", published by
-O'Reilly (ISBN 1-56592-257), covers them in great detail. The description
-here is intended as reference documentation.
+O'Reilly (ISBN 1-56592-257), covers them in great detail.
+
+The description here is intended as reference documentation. The basic
+operation of PCRE is on strings of bytes. However, there are the beginnings of
+some support for UTF-8 character strings. To use this support you must
+configure PCRE to include it, and then call \fBpcre_compile()\fR with the
+PCRE_UTF8 option. How this affects the pattern matching is described in the
+final section of this document.
 
 A regular expression is a pattern that is matched against a subject string from
 left to right. Most characters stand for themselves in a pattern, and match the
@@ -914,7 +1001,7 @@ PCRE_MULTILINE is set.
 
 Note that the sequences \\A, \\Z, and \\z can be used to match the start and
 end of the subject in both modes, and if all branches of a pattern start with
-\\A is it always anchored, whether PCRE_MULTILINE is set or not.
+\\A it is always anchored, whether PCRE_MULTILINE is set or not.
 
 
 .SH FULL STOP (PERIOD, DOT)
@@ -1018,7 +1105,7 @@ negation, which is indicated by a ^ character after the colon. For example,
 
   [12[:^digit:]]
 
-matches "1", "2", or any non-digit. PCRE (and Perl) also recogize the POSIX
+matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
 syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
 supported, and an error is given if they are encountered.
 
@@ -1116,7 +1203,7 @@ For example, if the string "the red king" is matched against the pattern
   the ((red|white) (king|queen))
 
 the captured substrings are "red king", "red", and "king", and are numbered 1,
-2, and 3.
+2, and 3, respectively.
 
 The fact that plain parentheses fulfil two functions is not always helpful.
 There are often times when a grouping subpattern is required without a
@@ -1210,7 +1297,7 @@ to the string
 
   /* first command */  not comment  /* second comment */
 
-fails, because it matches the entire string due to the greediness of the .*
+fails, because it matches the entire string owing to the greediness of the .*
 item.
 
 However, if a quantifier is followed by a question mark, it ceases to be
@@ -1311,7 +1398,7 @@ example, the pattern
 
   (a|b\\1)+
 
-matches any number of "a"s and also "aba", "ababaa" etc. At each iteration of
+matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
 the subpattern, the back reference matches the character string corresponding
 to the previous iteration. In order for this to work, the pattern must be such
 that the first iteration does not need to match the back reference. This can be
@@ -1529,9 +1616,10 @@ subpattern, a compile-time error occurs.
 
 There are two kinds of condition. If the text between the parentheses consists
 of a sequence of digits, the condition is satisfied if the capturing subpattern
-of that number has previously matched. Consider the following pattern, which
-contains non-significant white space to make it more readable (assume the
-PCRE_EXTENDED option) and to divide it into three parts for ease of discussion:
+of that number has previously matched. The number must be greater than zero.
+Consider the following pattern, which contains non-significant white space to
+make it more readable (assume the PCRE_EXTENDED option) and to divide it into
+three parts for ease of discussion:
 
   ( \\( )?    [^()]+    (?(1) \\) )
 
@@ -1685,6 +1773,208 @@ with the pattern above. The former gives a failure almost instantly when
 applied to a whole line of "a" characters, whereas the latter takes an
 appreciable time with strings longer than about 20 characters.
 
+
+.SH UTF-8 SUPPORT
+Starting at release 3.3, PCRE has some support for character strings encoded
+in the UTF-8 format. This is incomplete, and is regarded as experimental. In
+order to use it, you must configure PCRE to include UTF-8 support in the code,
+and, in addition, you must call \fBpcre_compile()\fR with the PCRE_UTF8 option
+flag. When you do this, both the pattern and any subject strings that are
+matched against it are treated as UTF-8 strings instead of just strings of
+bytes, but only in the cases that are mentioned below.
+
+If you compile PCRE with UTF-8 support, but do not use it at run time, the
+library will be a bit bigger, but the additional run time overhead is limited
+to testing the PCRE_UTF8 flag in several places, so should not be very large.
+
+PCRE assumes that the strings it is given contain valid UTF-8 codes. It does
+not diagnose invalid UTF-8 strings. If you pass invalid UTF-8 strings to PCRE,
+the results are undefined.
+
+Running with PCRE_UTF8 set causes these changes in the way PCRE works:
+
+1. In a pattern, the escape sequence \\x{...}, where the contents of the braces
+is a string of hexadecimal digits, is interpreted as a UTF-8 character whose
+code number is the given hexadecimal number, for example: \\x{1234}. This
+inserts from one to six literal bytes into the pattern, using the UTF-8
+encoding. If a non-hexadecimal digit appears between the braces, the item is
+not recognized.
+
+2. The original hexadecimal escape sequence, \\xhh, generates a two-byte UTF-8
+character if its value is greater than 127.
+
+3. Repeat quantifiers are NOT correctly handled if they follow a multibyte
+character. For example, \\x{100}* and \\xc3+ do not work. If you want to
+repeat such characters, you must enclose them in non-capturing parentheses,
+for example (?:\\x{100}), at present.
+
+4. The dot metacharacter matches one UTF-8 character instead of a single byte.
+
+5. Unlike literal UTF-8 characters, the dot metacharacter followed by a
+repeat quantifier does operate correctly on UTF-8 characters instead of
+single bytes.
+
+6. Although the \\x{...} escape is permitted in a character class, characters
+whose values are greater than 255 cannot be included in a class.
+
+7. A class is matched against a UTF-8 character instead of just a single byte,
+but it can match only characters whose values are less than 256. Characters
+with greater values always fail to match a class.
+
+8. Repeated classes work correctly on multiple characters.
+
+9. Classes containing just a single character whose value is greater than 127
+(but less than 256), for example, [\\x80] or [^\\x{93}], do not work because
+these are optimized into single byte matches. In the first case, of course,
+the class brackets are just redundant.
+
+10. Lookbehind assertions move backwards in the subject by a fixed number of
+characters instead of a fixed number of bytes. Simple cases have been tested
+to work correctly, but there may be hidden gotchas herein.
+
+11. The character types such as \\d and \\w do not work correctly with UTF-8
+characters. They continue to test a single byte.
+
+12. Anything not explicitly mentioned here continues to work in bytes rather
+than in characters.
+
+The following UTF-8 features of Perl 5.6 are not implemented:
+
+1. The escape sequence \\C to match a single byte.
+
+2. The use of Unicode tables and properties and escapes \\p, \\P, and \\X.
+
+
+.SH SAMPLE PROGRAM
+The code below is a simple, complete demonstration program, to get you started
+with using PCRE. This code is also supplied in the file \fIpcredemo.c\fR in the
+PCRE distribution.
+
+The program compiles the regular expression that is its first argument, and
+matches it against the subject string in its second argument. No options are
+set, and default character tables are used. If matching succeeds, the program
+outputs the portion of the subject that matched, together with the contents of
+any captured substrings.
+
+On a Unix system that has PCRE installed in \fI/usr/local\fR, you can compile
+the demonstration program using a command like this:
+
+  gcc -o pcredemo pcredemo.c -I/usr/local/include -L/usr/local/lib -lpcre
+
+Then you can run simple tests like this:
+
+  ./pcredemo 'cat|dog' 'the cat sat on the mat'
+
+Note that there is a much more comprehensive test program, called
+\fBpcretest\fR, which supports many more facilities for testing regular
+expressions. The \fBpcredemo\fR program is provided as a simple coding example.
+
+On some operating systems (e.g. Solaris) you may get an error like this when
+you try to run \fBpcredemo\fR:
+
+  ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory
+
+This is caused by the way shared library support works on those systems. You
+need to add
+
+  -R/usr/local/lib
+
+to the compile command to get round this problem. Here's the code:
+
+  #include <stdio.h>
+  #include <string.h>
+  #include <pcre.h>
+
+  #define OVECCOUNT 30    /* should be a multiple of 3 */
+
+  int main(int argc, char **argv)
+  {
+  pcre *re;
+  const char *error;
+  int erroffset;
+  int ovector[OVECCOUNT];
+  int rc, i;
+
+  if (argc != 3)
+    {
+    printf("Two arguments required: a regex and a "
+      "subject string\\n");
+    return 1;
+    }
+
+  /* Compile the regular expression in the first argument */
+
+  re = pcre_compile(
+    argv[1],     /* the pattern */
+    0,           /* default options */
+    &error,      /* for error message */
+    &erroffset,  /* for error offset */
+    NULL);       /* use default character tables */
+
+  /* Compilation failed: print the error message and exit */
+
+  if (re == NULL)
+    {
+    printf("PCRE compilation failed at offset %d: %s\\n",
+      erroffset, error);
+    return 1;
+    }
+
+  /* Compilation succeeded: match the subject in the second
+     argument */
+
+  rc = pcre_exec(
+    re,          /* the compiled pattern */
+    NULL,        /* we didn't study the pattern */
+    argv[2],     /* the subject string */
+    (int)strlen(argv[2]), /* the length of the subject */
+    0,           /* start at offset 0 in the subject */
+    0,           /* default options */
+    ovector,     /* vector for substring information */
+    OVECCOUNT);  /* number of elements in the vector */
+
+  /* Matching failed: handle error cases */
+
+  if (rc < 0)
+    {
+    switch(rc)
+      {
+      case PCRE_ERROR_NOMATCH: printf("No match\\n"); break;
+      /*
+      Handle other special cases if you like
+      */
+      default: printf("Matching error %d\\n", rc); break;
+      }
+    return 1;
+    }
+
+  /* Match succeeded */
+
+  printf("Match succeeded\\n");
+
+  /* The output vector wasn't big enough */
+
+  if (rc == 0)
+    {
+    rc = OVECCOUNT/3;
+    printf("ovector only has room for %d captured "
+      "substrings\\n", rc - 1);
+    }
+
+  /* Show substrings stored in the output vector */
+
+  for (i = 0; i < rc; i++)
+    {
+    char *substring_start = argv[2] + ovector[2*i];
+    int substring_length = ovector[2*i+1] - ovector[2*i];
+    printf("%2d: %.*s\\n", i, substring_length,
+      substring_start);
+    }
+
+  return 0;
+  }
+
+
 .SH AUTHOR
 Philip Hazel <ph10@cam.ac.uk>
 .br
@@ -1696,6 +1986,6 @@ Cambridge CB2 3QG, England.
 .br
 Phone: +44 1223 334714
 
-Last updated: 27 January 2000
+Last updated: 15 August 2001
 .br
-Copyright (c) 1997-2000 University of Cambridge.
+Copyright (c) 1997-2001 University of Cambridge.