diff options
author | Nick Kew <niq@apache.org> | 2010-09-21 20:42:20 +0200 |
---|---|---|
committer | Nick Kew <niq@apache.org> | 2010-09-21 20:42:20 +0200 |
commit | d4963eadb1091258de4e18b3ac56fe67a3c6d7aa (patch) | |
tree | e70d1c85f13801b0067d70fdf353e37a9d085d44 | |
parent | Reformat these <pre> tables as actual HTML tables (diff) | |
download | apache2-d4963eadb1091258de4e18b3ac56fe67a3c6d7aa.tar.xz apache2-d4963eadb1091258de4e18b3ac56fe67a3c6d7aa.zip |
Introduce ap_rxplus class: higher-level regexps supporting perl-style
regexp operations.
git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@999533 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- | CHANGES | 4 | ||||
-rw-r--r-- | include/ap_regex.h | 80 | ||||
-rw-r--r-- | server/Makefile.in | 2 | ||||
-rw-r--r-- | server/util_pcre.c | 1 | ||||
-rw-r--r-- | server/util_regex.c | 261 |
5 files changed, 346 insertions, 2 deletions
@@ -64,6 +64,10 @@ Changes with Apache 2.3.9 and sub-directories of matched directories are no longer implicitly matched. PR49809 [Eric Covener] + *) Regexps: introduce new higher-level regexp utility including parsing + and executing perl-style regexp ops (e.g. s/foo/bar/i) and regexp memory + [Nick Kew] + Changes with Apache 2.3.8 *) suexec: Support large log files. PR 45856. [Stefan Fritsch] diff --git a/include/ap_regex.h b/include/ap_regex.h index 91c0f49b31..9882c3fc5f 100644 --- a/include/ap_regex.h +++ b/include/ap_regex.h @@ -63,7 +63,7 @@ POSSIBILITY OF SUCH DAMAGE. extern "C" { #endif -/* Options for ap_regexec: */ +/* Options for ap_regcomp, ap_regexec, and ap_rxplus versions: */ #define AP_REG_ICASE 0x01 /** use a case-insensitive match */ #define AP_REG_NEWLINE 0x02 /** don't match newlines against '.' etc */ @@ -73,6 +73,10 @@ extern "C" { #define AP_REG_EXTENDED (0) /** unused */ #define AP_REG_NOSUB (0) /** unused */ +#define AP_REG_MULTI 0x10 /* perl's /g (needs fixing) */ +#define AP_REG_NOMEM 0x20 /* nomem in our code */ +#define AP_REG_DOTALL 0x40 /* perl's /s flag */ + /* Error values: */ enum { AP_REG_ASSERT = 1, /** internal error ? */ @@ -134,6 +138,80 @@ AP_DECLARE(apr_size_t) ap_regerror(int errcode, const ap_regex_t *preg, */ AP_DECLARE(void) ap_regfree(ap_regex_t *preg); +/* ap_rxplus: higher-level regexps */ + +typedef struct { + ap_regex_t rx; + apr_uint32_t flags; + const char *subs; + const char *match; + apr_size_t nmatch; + ap_regmatch_t *pmatch; +} ap_rxplus_t; + +/** + * Compile a pattern into a regexp. 
+ * supports perl-like formats + * match-string + * /match-string/flags + * s/match-string/replacement-string/flags + * Intended to support more perl-like stuff as and when round tuits happen + * match-string is anything supported by ap_regcomp + * replacement-string is a substitution string as supported in ap_pregsub + * flags should correspond with perl syntax: treat failure to do so as a bug + * (documentation TBD) + * @param pool Pool to allocate from + * @param pattern Pattern to compile + * @return Compiled regexp, or NULL in case of compile/syntax error + */ +AP_DECLARE(ap_rxplus_t*) ap_rxplus_compile(apr_pool_t *pool, const char *pattern); +/** + * Apply a regexp operation to a string. + * @param pool Pool to allocate from + * @param rx The regex match to apply + * @param pattern The string to apply it to + * NOTE: This MUST be kept in scope to use regexp memory + * @param newpattern The modified string (ignored if the operation doesn't + * modify the string) + * @return Number of times a match happens. Normally 0 (no match) or 1 + * (match found), but may be greater if a transforming pattern + * is applied with the 'g' flag. + */ +AP_DECLARE(int) ap_rxplus_exec(apr_pool_t *pool, ap_rxplus_t *rx, + const char *pattern, char **newpattern); +#ifdef DOXYGEN +/** + * Number of matches in the regexp operation's memory + * This may be 0 if no match is in memory, or up to nmatch from compilation + * @param rx The regexp + * @return Number of matches in memory + */ +AP_DECLARE(int) ap_rxplus_nmatch(ap_rxplus_t *rx); +#else +#define ap_rxplus_nmatch(rx) (((rx)->match != NULL) ? (rx)->nmatch : 0) +#endif +/** + * Get a pointer to a match from regex memory + * NOTE: this relies on the match pattern from the last call to + * ap_rxplus_exec still being valid (i.e. not freed or out-of-scope) + * @param rx The regexp + * @param n The match number to retrieve (must be between 0 and nmatch) + * @param len Returns the length of the match. 
+ * @param match Returns the match pattern + */ +AP_DECLARE(void) ap_rxplus_match(ap_rxplus_t *rx, int n, int *len, + const char **match); +/** + * Get a match from regex memory in a string copy + * NOTE: this relies on the match pattern from the last call to + * ap_rxplus_exec still being valid (i.e. not freed or out-of-scope) + * @param pool Pool to allocate from + * @param rx The regexp + * @param n The match number to retrieve (must be between 0 and nmatch) + * @return The matched string + */ +AP_DECLARE(char*) ap_rxplus_pmatch(apr_pool_t *pool, ap_rxplus_t *rx, int n); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/server/Makefile.in b/server/Makefile.in index 17e5d13995..10b41446a5 100644 --- a/server/Makefile.in +++ b/server/Makefile.in @@ -12,7 +12,7 @@ LTLIBRARY_SOURCES = \ util_script.c util_md5.c util_cfgtree.c util_ebcdic.c util_time.c \ connection.c listen.c util_mutex.c mpm_common.c mpm_unix.c \ util_charset.c util_cookies.c util_debug.c util_xml.c \ - util_expr.c util_filter.c util_pcre.c exports.c \ + util_expr.c util_filter.c util_pcre.c util_regex.c exports.c \ scoreboard.c error_bucket.c protocol.c core.c request.c provider.c \ eoc_bucket.c eor_bucket.c core_filters.c diff --git a/server/util_pcre.c b/server/util_pcre.c index 089c6d360b..8309d5a4a8 100644 --- a/server/util_pcre.c +++ b/server/util_pcre.c @@ -128,6 +128,7 @@ int options = 0; if ((cflags & AP_REG_ICASE) != 0) options |= PCRE_CASELESS; if ((cflags & AP_REG_NEWLINE) != 0) options |= PCRE_MULTILINE; +if ((cflags & AP_REG_DOTALL) != 0) options |= PCRE_DOTALL; preg->re_pcre = pcre_compile(pattern, options, &errorptr, &erroffset, NULL); preg->re_erroffset = erroffset; diff --git a/server/util_regex.c b/server/util_regex.c new file mode 100644 index 0000000000..458e4f6f14 --- /dev/null +++ b/server/util_regex.c @@ -0,0 +1,261 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "apr.h" +#include "apr_lib.h" +#include "apr_pools.h" +#include "apr_strings.h" +#include "ap_config.h" +#include "ap_regex.h" +#include "httpd.h" + +AP_DECLARE(ap_rxplus_t*) ap_rxplus_compile(apr_pool_t *pool, + const char *pattern) +{ + /* perl style patterns + * add support for more as and when wanted + * substitute: s/rx/subs/ + * match: m/rx/ or just /rx/ + */ + + /* allow any nonalnum delimiter as first or second char. + * If we ever use this with non-string pattern we'll need an extra check + */ + const char *endp = 0; + const char *str = pattern; + const char *rxstr; + ap_rxplus_t *ret = apr_pcalloc(pool, sizeof(ap_rxplus_t)); + char delim = 0; + enum { SUBSTITUTE = 's', MATCH = 'm'} action = MATCH; + if (!apr_isalnum(pattern[0])) { + delim = *str++; + } + else if (pattern[0] == 's' && !apr_isalnum(pattern[1])) { + action = SUBSTITUTE; + delim = pattern[1]; + str += 2; + } + else if (pattern[0] == 'm' && !apr_isalnum(pattern[1])) { + delim = pattern[1]; + str += 2; + } + /* TODO: support perl's after/before */ + /* FIXME: fix these simpleminded delims */ + + /* we think there's a delimiter. 
Allow for it not to be if unmatched */ + if (delim) { + endp = ap_strchr_c(str, delim); + } + if (!endp) { /* there's no delim or flags */ + if (ap_regcomp(&ret->rx, pattern, 0) == 0) { + apr_pool_cleanup_register(pool, &ret->rx, (void*) ap_regfree, + apr_pool_cleanup_null); + return ret; + } + else { + return NULL; + } + } + + /* We have a delimiter. Use it to extract the regexp */ + rxstr = apr_pstrndup(pool, str, endp-str); + + /* If it's a substitution, we need the replacement string + * TODO: possible future enhancement - support other parsing + * in the replacement string. + */ + if (action == SUBSTITUTE) { + str = endp+1; + if (!*str || (endp = ap_strchr_c(str, delim), !endp)) { + /* missing replacement string is an error */ + return NULL; + } + ret->subs = apr_pstrndup(pool, str, (endp-str)); + } + + /* anything after the current delimiter is flags */ + while (*++endp) { + switch (*endp) { + case 'i': ret->flags |= AP_REG_ICASE; break; + case 'm': ret->flags |= AP_REG_NEWLINE; break; + case 'n': ret->flags |= AP_REG_NOMEM; break; + case 'g': ret->flags |= AP_REG_MULTI; break; + case 's': ret->flags |= AP_REG_DOTALL; break; + case '^': ret->flags |= AP_REG_NOTBOL; break; + case '$': ret->flags |= AP_REG_NOTEOL; break; + default: break; /* we should probably be stricter here */ + } + } + if (ap_regcomp(&ret->rx, rxstr, ret->flags) == 0) { + apr_pool_cleanup_register(pool, &ret->rx, (void*) ap_regfree, + apr_pool_cleanup_null); + } + else { + return NULL; + } + if (!(ret->flags & AP_REG_NOMEM)) { + /* count size of memory required, starting at 1 for the whole-match + * Simpleminded should be fine 'cos regcomp already checked syntax + */ + ret->nmatch = 1; + while (*rxstr) { + switch (*rxstr++) { + case '\\': /* next char is escaped - skip it */ + if (*rxstr != 0) { + ++rxstr; + } + break; + case '(': /* unescaped bracket implies memory */ + ++ret->nmatch; + break; + default: + break; + } + } + ret->pmatch = apr_palloc(pool, ret->nmatch*sizeof(ap_regmatch_t)); 
+ } + return ret; +} + +AP_DECLARE(int) ap_rxplus_exec(apr_pool_t *pool, ap_rxplus_t *rx, + const char *pattern, char **newpattern) + //int max_iterations) +{ +#if 1 + int ret = 1; + int startl, oldl, newl, diffsz; + const char *remainder; + char *subs; +/* snrf process_regexp from mod_headers */ + if (ap_regexec(&rx->rx, pattern, rx->nmatch, rx->pmatch, rx->flags) != 0) { + rx->match = NULL; + return 0; /* no match, nothing to do */ + } + rx->match = pattern; + if (rx->subs) { + *newpattern = ap_pregsub(pool, rx->subs, pattern, + rx->nmatch, rx->pmatch); + if (!*newpattern) { + return 0; /* FIXME - should we do more to handle error? */ + } + startl = rx->pmatch[0].rm_so; + oldl = rx->pmatch[0].rm_eo - startl; + newl = strlen(*newpattern); + diffsz = newl - oldl; + remainder = pattern + startl + oldl; + if (rx->flags & AP_REG_MULTI) { + /* recurse to do any further matches */ + char *subs; + ret += ap_rxplus_exec(pool, rx, remainder, &subs); + if (ret > 1) { + /* a further substitution happened */ + diffsz += strlen(subs) - strlen(remainder); + remainder = subs; + } + } + subs = apr_palloc(pool, strlen(pattern) + 1 + diffsz); + memcpy(subs, pattern, startl); + memcpy(subs+startl, *newpattern, newl); + strcpy(subs+startl+newl, remainder); + *newpattern = subs; + } + return ret; + + + + +#else + + + + + + + + + + + + + if (!(rx->flags & AP_REG_MULTI) || (rx->subs == NULL)) { + max_iterations = 1; + } + /* FIXME: multi-matching is incorrect */ + while (max_iterations-- > 0) { + if (ap_regexec(&rx->rx, pattern, rx->nmatch, rx->pmatch, rx->flags) + == 0) { + ret++; + if (rx->subs) { + rx->match = pattern; + *newpattern = ap_pregsub(pool, rx->subs, pattern, + rx->nmatch, rx->pmatch); + pattern = *newpattern; + if (pattern == NULL) { + max_iterations = 0; + } + } + } + else { + max_iterations = 0; + } + } + + if (ret == 0 || rx->flags&AP_REG_NOMEM) { + rx->match = NULL; /* no match, so don't pretend to remember a match */ + } + else { +#if 0 + /* FIXME - should we be 
'safe' and take the performance hit, + * or just document thou-shalt-keep-pattern-in-scope? + */ + if (rx->match == inpattern) { + rx->match = apr_pstrdup(pool, inpattern); + } +#endif + } + return ret; +#endif +} +#ifdef DOXYGEN +AP_DECLARE(int) ap_rxplus_nmatch(ap_rxplus_t *rx) +{ + return (rx->match != NULL) ? rx->nmatch : 0; +} +#endif + +/* If this blows up on you, see the notes in the header/apidoc + * rx->match is a pointer and it's your responsibility to ensure + * it hasn't gone out-of-scope since the last ap_rxplus_exec + */ +AP_DECLARE(void) ap_rxplus_match(ap_rxplus_t *rx, int n, int *len, + const char **match) +{ + if (n >= 0 && n < ap_rxplus_nmatch(rx)) { + *match = rx->match + rx->pmatch[n].rm_so; + *len = rx->pmatch[n].rm_eo - rx->pmatch[n].rm_so; + } + else { + *len = -1; + *match = NULL; + } +} +AP_DECLARE(char*) ap_rxplus_pmatch(apr_pool_t *pool, ap_rxplus_t *rx, int n) +{ + int len; + const char *match; + ap_rxplus_match(rx, n, &len, &match); + return (match != NULL) ? apr_pstrndup(pool, match, len) : NULL; +} |