diff options
author | Nick Kew <niq@apache.org> | 2014-06-01 22:05:44 +0200 |
---|---|---|
committer | Nick Kew <niq@apache.org> | 2014-06-01 22:05:44 +0200 |
commit | f0879f75422c1b0ed19066de0c43f304baec82c2 (patch) | |
tree | bb44faaf7a53eece8d6464227d379ce52e422ecd | |
parent | Syntax highlight (diff) | |
download | apache2-f0879f75422c1b0ed19066de0c43f304baec82c2.tar.xz apache2-f0879f75422c1b0ed19066de0c43f304baec82c2.zip |
mod_proxy_html: support automatic doctype detection.
PR 56285
Patch by Micha Lenk, adapted by niq
git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1599027 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- | CHANGES | 3 | ||||
-rw-r--r-- | docs/manual/mod/mod_proxy_html.xml | 11 | ||||
-rw-r--r-- | modules/filters/mod_proxy_html.c | 37 |
3 files changed, 45 insertions, 6 deletions
@@ -1,6 +1,9 @@ -*- coding: utf-8 -*- Changes with Apache 2.5.0 + *) mod_proxy_html: support automatic detection of doctype and processing + of FPIs. PR56285 [Micha Lenk <micha lenk info>, Nick Kew] + *) mod_proxy_html: skip documents shorter than 4 bytes PR 56286 [Micha Lenk <micha lenk info>] diff --git a/docs/manual/mod/mod_proxy_html.xml b/docs/manual/mod/mod_proxy_html.xml index eeed13e5f9..b752a1796e 100644 --- a/docs/manual/mod/mod_proxy_html.xml +++ b/docs/manual/mod/mod_proxy_html.xml @@ -227,7 +227,10 @@ for earlier 2.x versions</compatibility> <name>ProxyHTMLDocType</name> <description>Sets an HTML or XHTML document type declaration.</description> <syntax>ProxyHTMLDocType <var>HTML|XHTML [Legacy]</var><br/><strong>OR</strong> -<br/>ProxyHTMLDocType <var>fpi [SGML|XML]</var></syntax> +<br/>ProxyHTMLDocType <var>fpi [SGML|XML]</var><br/><strong>OR</strong> +<br/>ProxyHTMLDocType <var>html5</var><br/><strong>OR</strong> +<br/>ProxyHTMLDocType <var>auto</var></syntax> +<default>ProxyHTMLDocType auto (2.5/trunk versions); no FPI (2.4.x)</default> <contextlist><context>server config</context> <context>virtual host</context><context>directory</context> </contextlist> @@ -245,9 +248,9 @@ be necessary if you are proxying pre-1998 content or working with defective authoring/publishing tools.</p> <p>In the second form, it will insert your own FPI. The optional second argument determines whether SGML/HTML or XML/XHTML syntax will be used.</p> -<p>The default is changed to omitting any FPI, -on the grounds that no FPI is better than a bogus one. If your backend -generates decent HTML or XHTML, set it accordingly.</p> +<p>The third form declares documents as HTML 5.</p> +<p>The fourth form is new in HTTPD trunk and not yet available in released +versions, and uses libxml2's HTML parser to detect the doctype.</p> <p>If the first form is used, mod_proxy_html will also clean up the HTML to the specified standard. It cannot fix every error, but it will strip out bogus elements and attributes. diff --git a/modules/filters/mod_proxy_html.c b/modules/filters/mod_proxy_html.c index 8db7997e36..dcd4b1a5b2 100644 --- a/modules/filters/mod_proxy_html.c +++ b/modules/filters/mod_proxy_html.c @@ -108,6 +108,7 @@ typedef struct { size_t avail; const char *encoding; urlmap *map; + const char *etag; } saxctxt; @@ -280,6 +281,33 @@ static void dump_content(saxctxt *ctx) } AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1); } +static void pinternalSubset(void* ctxt, const xmlChar *name, + const xmlChar *externalID, const xmlChar *sysID) +{ + saxctxt* ctx = (saxctxt*) ctxt; + if (!ctxt || !name) { + /* sanity check */ + return; + } + if (ctx->cfg->doctype != DEFAULT_DOCTYPE) { + /* do nothing if overridden in config */ + return; + } + ap_fputstrs(ctx->f->next, ctx->bb, "<!DOCTYPE ", (const char *)name, NULL); + if (externalID) { + if (!strcasecmp((const char*)name, "html") && + !strncasecmp((const char *)externalID, "-//W3C//DTD XHTML ", 18)) { + ctx->etag = xhtml_etag; + } + else { + ctx->etag = html_etag; + } + ap_fputstrs(ctx->f->next, ctx->bb, " PUBLIC \"", (const char *)externalID, "\"", NULL); + if (sysID) + ap_fputstrs(ctx->f->next, ctx->bb, " \"", (const char *)sysID, "\"", NULL); + } + ap_fputs(ctx->f->next, ctx->bb, ">\n"); +} static void pcdata(void *ctxt, const xmlChar *uchars, int length) { const char *chars = (const char*) uchars; @@ -632,7 +660,7 @@ static void pstartElement(void *ctxt, const xmlChar *uname, } ctx->offset = 0; if (desc && desc->empty) - ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag); + ap_fputs(ctx->f->next, ctx->bb, ctx->etag); else ap_fputc(ctx->f->next, ctx->bb, '>'); @@ -837,6 +865,7 @@ static saxctxt *check_filter_init (ap_filter_t *f) fctx->bb = apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc); fctx->cfg = cfg; + fctx->etag = cfg->etag; apr_table_unset(f->r->headers_out, "Content-Length"); if (cfg->interp) @@ -1129,7 +1158,10 @@ static const char *set_doctype(cmd_parms *cmd, void *CFG, const char *t, const char *l) { proxy_html_conf *cfg = (proxy_html_conf *)CFG; - if (!strcasecmp(t, "xhtml")) { + if (!strcasecmp(t, "auto")) { + cfg->doctype = DEFAULT_DOCTYPE; /* activates pinternalSubset */ + } + else if (!strcasecmp(t, "xhtml")) { cfg->etag = xhtml_etag; if (l && !strcasecmp(l, "legacy")) cfg->doctype = fpi_xhtml_legacy; @@ -1249,6 +1281,7 @@ static int mod_proxy_html(apr_pool_t *p, apr_pool_t *p1, apr_pool_t *p2) sax.characters = pcharacters; sax.comment = pcomment; sax.cdataBlock = pcdata; + sax.internalSubset = pinternalSubset; xml2enc_charset = APR_RETRIEVE_OPTIONAL_FN(xml2enc_charset); xml2enc_filter = APR_RETRIEVE_OPTIONAL_FN(xml2enc_filter); if (!xml2enc_charset) { |