summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Kew <niq@apache.org>2010-09-21 20:42:20 +0200
committerNick Kew <niq@apache.org>2010-09-21 20:42:20 +0200
commitd4963eadb1091258de4e18b3ac56fe67a3c6d7aa (patch)
treee70d1c85f13801b0067d70fdf353e37a9d085d44
parentReformat these <pre> tables as actual HTML tables (diff)
downloadapache2-d4963eadb1091258de4e18b3ac56fe67a3c6d7aa.tar.xz
apache2-d4963eadb1091258de4e18b3ac56fe67a3c6d7aa.zip
Introduce ap_rxplus class: higher-level regexps supporting perl-style
regexp operations.

git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@999533 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r--CHANGES4
-rw-r--r--include/ap_regex.h80
-rw-r--r--server/Makefile.in2
-rw-r--r--server/util_pcre.c1
-rw-r--r--server/util_regex.c261
5 files changed, 346 insertions, 2 deletions
diff --git a/CHANGES b/CHANGES
index 9acc74b41c..32e5c20e90 100644
--- a/CHANGES
+++ b/CHANGES
@@ -64,6 +64,10 @@ Changes with Apache 2.3.9
and sub-directories of matched directories are no longer implicitly
matched. PR49809 [Eric Covener]
+ *) Regexps: introduce new higher-level regexp utility including parsing
+ and executing perl-style regexp ops (e.g. s/foo/bar/i) and regexp memory.
+ [Nick Kew]
+
Changes with Apache 2.3.8
*) suexec: Support large log files. PR 45856. [Stefan Fritsch]
diff --git a/include/ap_regex.h b/include/ap_regex.h
index 91c0f49b31..9882c3fc5f 100644
--- a/include/ap_regex.h
+++ b/include/ap_regex.h
@@ -63,7 +63,7 @@ POSSIBILITY OF SUCH DAMAGE.
extern "C" {
#endif
-/* Options for ap_regexec: */
+/* Options for ap_regcomp, ap_regexec, and ap_rxplus versions: */
#define AP_REG_ICASE 0x01 /** use a case-insensitive match */
#define AP_REG_NEWLINE 0x02 /** don't match newlines against '.' etc */
@@ -73,6 +73,10 @@ extern "C" {
#define AP_REG_EXTENDED (0) /** unused */
#define AP_REG_NOSUB (0) /** unused */
+#define AP_REG_MULTI 0x10 /* perl's /g: ap_rxplus_exec repeats a substitution over the rest of the string (needs fixing) */
+#define AP_REG_NOMEM 0x20 /* ap_rxplus 'n' flag: ap_rxplus_compile allocates no match memory */
+#define AP_REG_DOTALL 0x40 /* perl's /s flag; translated to PCRE_DOTALL in util_pcre.c */
+
/* Error values: */
enum {
AP_REG_ASSERT = 1, /** internal error ? */
@@ -134,6 +138,80 @@ AP_DECLARE(apr_size_t) ap_regerror(int errcode, const ap_regex_t *preg,
*/
AP_DECLARE(void) ap_regfree(ap_regex_t *preg);
+/* ap_rxplus: higher-level regexps */
+
+typedef struct {
+ ap_regex_t rx; /* regex compiled by ap_regcomp() inside ap_rxplus_compile() */
+ apr_uint32_t flags; /* AP_REG_* bits parsed from the pattern's trailing flag letters */
+ const char *subs; /* replacement string of an s/// pattern; NULL for a match-only pattern */
+ const char *match; /* string last matched by ap_rxplus_exec(), NULL if it did not match; owned by the caller */
+ apr_size_t nmatch; /* number of entries in pmatch: whole match plus one per '(' counted at compile time; 0 without memory */
+ ap_regmatch_t *pmatch; /* match offsets filled in by ap_regexec(), relative to match */
+} ap_rxplus_t;
+
+/**
+ * Compile a pattern into a regexp.
+ * supports perl-like formats
+ * match-string
+ * /match-string/flags
+ * s/match-string/replacement-string/flags
+ * Intended to support more perl-like stuff as and when round tuits happen
+ * match-string is anything supported by ap_regcomp
+ * replacement-string is a substitution string as supported in ap_pregsub
+ * flags should correspond with perl syntax: treat failure to do so as a bug
+ * (documentation TBD)
+ * @param pool Pool to allocate from
+ * @param pattern Pattern to compile
+ * @return Compiled regexp, or NULL in case of compile/syntax error
+ */
+AP_DECLARE(ap_rxplus_t*) ap_rxplus_compile(apr_pool_t *pool, const char *pattern);
+/**
+ * Apply a regexp operation to a string.
+ * @param pool Pool to allocate from
+ * @param rx The regex match to apply
+ * @param pattern The string to apply it to
+ * NOTE: This MUST be kept in scope to use regexp memory
+ * @param newpattern The modified string (ignored if the operation doesn't
+ * modify the string)
+ * @return Number of times a match happens. Normally 0 (no match) or 1
+ * (match found), but may be greater if a transforming pattern
+ * is applied with the 'g' flag.
+ */
+AP_DECLARE(int) ap_rxplus_exec(apr_pool_t *pool, ap_rxplus_t *rx,
+ const char *pattern, char **newpattern);
+#ifdef DOXYGEN
+/**
+ * Number of matches in the regexp operation's memory
+ * This may be 0 if no match is in memory, or up to nmatch from compilation
+ * NOTE: outside DOXYGEN builds this is a macro that evaluates rx twice,
+ * so rx must not be an expression with side effects.
+ * @param rx The regexp
+ * @return Number of matches in memory
+ */
+AP_DECLARE(int) ap_rxplus_nmatch(ap_rxplus_t *rx);
+#else
+#define ap_rxplus_nmatch(rx) (((rx)->match != NULL) ? (rx)->nmatch : 0)
+#endif
+/**
+ * Get a pointer to a match from regex memory
+ * NOTE: this relies on the match pattern from the last call to
+ * ap_rxplus_exec still being valid (i.e. not freed or out-of-scope)
+ * @param rx The regexp
+ * @param n The match number to retrieve (0 <= n < ap_rxplus_nmatch(rx))
+ * @param len Returns the length of the match.
+ * @param match Returns the match pattern
+ */
+AP_DECLARE(void) ap_rxplus_match(ap_rxplus_t *rx, int n, int *len,
+ const char **match);
+/**
+ * Get a match from regex memory in a string copy
+ * NOTE: this relies on the match pattern from the last call to
+ * ap_rxplus_exec still being valid (i.e. not freed or out-of-scope)
+ * @param pool Pool to allocate from
+ * @param rx The regexp
+ * @param n The match number to retrieve (0 <= n < ap_rxplus_nmatch(rx))
+ * @return The matched string
+ */
+AP_DECLARE(char*) ap_rxplus_pmatch(apr_pool_t *pool, ap_rxplus_t *rx, int n);
+
#ifdef __cplusplus
} /* extern "C" */
#endif
diff --git a/server/Makefile.in b/server/Makefile.in
index 17e5d13995..10b41446a5 100644
--- a/server/Makefile.in
+++ b/server/Makefile.in
@@ -12,7 +12,7 @@ LTLIBRARY_SOURCES = \
util_script.c util_md5.c util_cfgtree.c util_ebcdic.c util_time.c \
connection.c listen.c util_mutex.c mpm_common.c mpm_unix.c \
util_charset.c util_cookies.c util_debug.c util_xml.c \
- util_expr.c util_filter.c util_pcre.c exports.c \
+ util_expr.c util_filter.c util_pcre.c util_regex.c exports.c \
scoreboard.c error_bucket.c protocol.c core.c request.c provider.c \
eoc_bucket.c eor_bucket.c core_filters.c
diff --git a/server/util_pcre.c b/server/util_pcre.c
index 089c6d360b..8309d5a4a8 100644
--- a/server/util_pcre.c
+++ b/server/util_pcre.c
@@ -128,6 +128,7 @@ int options = 0;
if ((cflags & AP_REG_ICASE) != 0) options |= PCRE_CASELESS;
if ((cflags & AP_REG_NEWLINE) != 0) options |= PCRE_MULTILINE;
+if ((cflags & AP_REG_DOTALL) != 0) options |= PCRE_DOTALL;
preg->re_pcre = pcre_compile(pattern, options, &errorptr, &erroffset, NULL);
preg->re_erroffset = erroffset;
diff --git a/server/util_regex.c b/server/util_regex.c
new file mode 100644
index 0000000000..458e4f6f14
--- /dev/null
+++ b/server/util_regex.c
@@ -0,0 +1,261 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "apr.h"
+#include "apr_lib.h"
+#include "apr_pools.h"
+#include "apr_strings.h"
+#include "ap_config.h"
+#include "ap_regex.h"
+#include "httpd.h"
+
+/* Pool cleanup adapter for a compiled regex.
+ * ap_regfree() returns void, whereas a pool cleanup must have the type
+ * apr_status_t (*)(void *).  Wrap it instead of casting the function
+ * pointer, which would make the pool call it through an incompatible type
+ * and read a status value that was never returned.
+ */
+static apr_status_t rxplus_cleanup(void *preg)
+{
+    ap_regfree((ap_regex_t *) preg);
+    return APR_SUCCESS;
+}
+
+/* Parse a perl-style pattern and compile it into an ap_rxplus_t.
+ *
+ * Accepted forms (add support for more as and when wanted):
+ *     match-string                 no delimiters, no flags
+ *     /rx/flags  or  m/rx/flags    match
+ *     s/rx/subs/flags              substitute
+ * Any non-alphanumeric character may serve as the delimiter.
+ *
+ * Flags: i (ICASE), m (NEWLINE), n (NOMEM), g (MULTI), s (DOTALL),
+ *        ^ (NOTBOL), $ (NOTEOL).  Unrecognised flag characters are ignored.
+ *
+ * If the opening delimiter has no partner, the whole pattern is compiled as
+ * a plain regex with no flags, and no match memory is allocated for it.
+ *
+ * Returns the compiled object, or NULL if the regex does not compile or an
+ * s-pattern lacks its replacement string.  The compiled regex is released
+ * by a cleanup registered on pool.
+ */
+AP_DECLARE(ap_rxplus_t*) ap_rxplus_compile(apr_pool_t *pool,
+                                           const char *pattern)
+{
+    const char *endp = NULL;
+    const char *str = pattern;
+    const char *rxstr;
+    ap_rxplus_t *ret = apr_pcalloc(pool, sizeof(ap_rxplus_t));
+    char delim = 0;
+    enum { SUBSTITUTE = 's', MATCH = 'm'} action = MATCH;
+
+    /* allow any nonalnum delimiter as first or second char.
+     * If we ever use this with non-string pattern we'll need an extra check
+     */
+    if (!apr_isalnum(pattern[0])) {
+        delim = *str++;
+    }
+    else if (pattern[0] == 's' && !apr_isalnum(pattern[1])) {
+        action = SUBSTITUTE;
+        delim = pattern[1];
+        str += 2;
+    }
+    else if (pattern[0] == 'm' && !apr_isalnum(pattern[1])) {
+        delim = pattern[1];
+        str += 2;
+    }
+    /* TODO: support perl's after/before */
+    /* FIXME: fix these simpleminded delims (an escaped delimiter inside the
+     * regex is still taken as the closing one)
+     */
+
+    /* we think there's a delimiter.  Allow for it not to be if unmatched */
+    if (delim) {
+        endp = ap_strchr_c(str, delim);
+    }
+    if (!endp) { /* there's no delim or flags */
+        if (ap_regcomp(&ret->rx, pattern, 0) != 0) {
+            return NULL;
+        }
+        apr_pool_cleanup_register(pool, &ret->rx, rxplus_cleanup,
+                                  apr_pool_cleanup_null);
+        return ret;
+    }
+
+    /* We have a delimiter.  Use it to extract the regexp */
+    rxstr = apr_pstrndup(pool, str, endp-str);
+
+    /* If it's a substitution, we need the replacement string
+     * TODO: possible future enhancement - support other parsing
+     * in the replacement string.
+     */
+    if (action == SUBSTITUTE) {
+        str = endp+1;
+        if (!*str || (endp = ap_strchr_c(str, delim), !endp)) {
+            /* missing replacement string is an error */
+            return NULL;
+        }
+        ret->subs = apr_pstrndup(pool, str, (endp-str));
+    }
+
+    /* anything after the current delimiter is flags */
+    while (*++endp) {
+        switch (*endp) {
+        case 'i': ret->flags |= AP_REG_ICASE; break;
+        case 'm': ret->flags |= AP_REG_NEWLINE; break;
+        case 'n': ret->flags |= AP_REG_NOMEM; break;
+        case 'g': ret->flags |= AP_REG_MULTI; break;
+        case 's': ret->flags |= AP_REG_DOTALL; break;
+        case '^': ret->flags |= AP_REG_NOTBOL; break;
+        case '$': ret->flags |= AP_REG_NOTEOL; break;
+        default: break; /* we should probably be stricter here */
+        }
+    }
+    if (ap_regcomp(&ret->rx, rxstr, ret->flags) != 0) {
+        return NULL;
+    }
+    apr_pool_cleanup_register(pool, &ret->rx, rxplus_cleanup,
+                              apr_pool_cleanup_null);
+
+    if (!(ret->flags & AP_REG_NOMEM)) {
+        /* count size of memory required, starting at 1 for the whole-match
+         * Simpleminded should be fine 'cos regcomp already checked syntax;
+         * at worst this over-counts and a few slots go unused.
+         */
+        ret->nmatch = 1;
+        while (*rxstr) {
+            switch (*rxstr++) {
+            case '\\': /* next char is escaped - skip it */
+                if (*rxstr != 0) {
+                    ++rxstr;
+                }
+                break;
+            case '(': /* unescaped bracket implies memory */
+                ++ret->nmatch;
+                break;
+            default:
+                break;
+            }
+        }
+        ret->pmatch = apr_palloc(pool, ret->nmatch*sizeof(ap_regmatch_t));
+    }
+    return ret;
+}
+
+/* Apply a compiled ap_rxplus_t to a string.
+ *
+ * Match-only patterns (rx->subs == NULL): the regex is run once and
+ * newpattern is not touched.  Returns 1 on a match, 0 otherwise.
+ *
+ * Substitutions: the first match is replaced, or every match when the g
+ * flag (AP_REG_MULTI) is set, and the pool-allocated result is stored in
+ * *newpattern.  Returns the number of replacements made.  When nothing
+ * matched, 0 is returned and *newpattern is left untouched; when
+ * ap_pregsub() fails on the very first match, 0 is returned and
+ * *newpattern is set to NULL.
+ *
+ * The scan is iterative, so stack use does not grow with the number of
+ * matches.  A zero-length match steps one character forward before the
+ * next search so that the scan always makes progress.  Searches after the
+ * first start in the middle of the input and are therefore run with
+ * AP_REG_NOTBOL.
+ *
+ * rx->match is set to the string most recently searched successfully, or
+ * NULL when the most recent search failed -- which is how a g scan normally
+ * ends.  It points into pattern, so the caller must keep pattern alive for
+ * as long as the match memory is used.
+ */
+AP_DECLARE(int) ap_rxplus_exec(apr_pool_t *pool, ap_rxplus_t *rx,
+                               const char *pattern, char **newpattern)
+{
+    ap_regmatch_t whole;                /* fallback slot, see below */
+    ap_regmatch_t *pmatch = rx->pmatch;
+    apr_size_t nmatch = rx->nmatch;
+    const char *result = "";            /* output assembled so far */
+    const char *copied = pattern;       /* input before this is in result */
+    const char *scan = pattern;         /* where the next search starts */
+    int eflags = (int) rx->flags;
+    int count = 0;
+
+    if (rx->subs == NULL) {
+        /* plain match: just record whether (and where) it matched */
+        if (ap_regexec(&rx->rx, pattern, rx->nmatch, rx->pmatch,
+                       rx->flags) != 0) {
+            rx->match = NULL;
+            return 0; /* no match, nothing to do */
+        }
+        rx->match = pattern;
+        return 1;
+    }
+
+    /* Splicing needs the offsets of the whole match.  A regex compiled
+     * with the 'n' flag has no match memory at all, so borrow one local
+     * slot rather than dereferencing a NULL rx->pmatch.
+     */
+    if (pmatch == NULL || nmatch == 0) {
+        pmatch = &whole;
+        nmatch = 1;
+    }
+
+    for (;;) {
+        const char *repl;
+        int so, eo;
+
+        if (ap_regexec(&rx->rx, scan, nmatch, pmatch, eflags) != 0) {
+            rx->match = NULL;
+            break;
+        }
+        rx->match = scan;
+
+        repl = ap_pregsub(pool, rx->subs, scan, nmatch, pmatch);
+        if (repl == NULL) {
+            if (count == 0) {
+                *newpattern = NULL;
+                return 0; /* FIXME - should we do more to handle error? */
+            }
+            break; /* keep what was substituted so far */
+        }
+
+        so = pmatch[0].rm_so;
+        eo = pmatch[0].rm_eo;
+
+        /* unmatched text up to the match, then the replacement */
+        result = apr_pstrcat(pool, result,
+                             apr_pstrndup(pool, copied,
+                                          (apr_size_t)((scan + so) - copied)),
+                             repl, NULL);
+        scan += eo;
+        copied = scan;
+        ++count;
+
+        if (!(rx->flags & AP_REG_MULTI)) {
+            break;
+        }
+        if (eo == so) {
+            /* empty match: searching again from the same place would find
+             * it again for ever.  Step over one character; it stays behind
+             * 'copied' and is emitted with the next chunk.
+             */
+            if (*scan == '\0') {
+                break;
+            }
+            ++scan;
+        }
+        /* from here on we are no longer at the start of the input */
+        eflags |= AP_REG_NOTBOL;
+    }
+
+    if (count == 0) {
+        return 0;
+    }
+    *newpattern = apr_pstrcat(pool, result, copied, NULL);
+    return count;
+}
+#ifdef DOXYGEN
+/* Number of entries in the match memory: rx->nmatch while a match is
+ * remembered (rx->match set), otherwise 0.
+ */
+AP_DECLARE(int) ap_rxplus_nmatch(ap_rxplus_t *rx)
+{
+    if (rx->match == NULL) {
+        return 0;
+    }
+    return rx->nmatch;
+}
+#endif
+
+/* Look up entry n of the match memory left by the last ap_rxplus_exec().
+ * Nothing is copied: *match points into the string that was handed to
+ * ap_rxplus_exec() (rx->match), so it is the caller's responsibility to
+ * make sure that string has not gone out of scope in the meantime.
+ * If n is out of range, or no match is in memory, *match is set to NULL
+ * and *len to -1.
+ */
+AP_DECLARE(void) ap_rxplus_match(ap_rxplus_t *rx, int n, int *len,
+                                 const char **match)
+{
+    const ap_regmatch_t *slot;
+
+    if (n < 0 || n >= ap_rxplus_nmatch(rx)) {
+        *len = -1;
+        *match = NULL;
+        return;
+    }
+
+    slot = &rx->pmatch[n];
+    *match = rx->match + slot->rm_so;
+    *len = slot->rm_eo - slot->rm_so;
+}
+/* Pool-allocated, NUL-terminated copy of entry n of the match memory.
+ * Returns NULL when ap_rxplus_match() reports no such entry.
+ */
+AP_DECLARE(char*) ap_rxplus_pmatch(apr_pool_t *pool, ap_rxplus_t *rx, int n)
+{
+    const char *start;
+    int length;
+
+    ap_rxplus_match(rx, n, &length, &start);
+    if (start == NULL) {
+        return NULL;
+    }
+    return apr_pstrndup(pool, start, length);
+}