summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Kew <niq@apache.org>2014-06-01 22:05:44 +0200
committerNick Kew <niq@apache.org>2014-06-01 22:05:44 +0200
commitf0879f75422c1b0ed19066de0c43f304baec82c2 (patch)
treebb44faaf7a53eece8d6464227d379ce52e422ecd
parentSyntax highlight (diff)
downloadapache2-f0879f75422c1b0ed19066de0c43f304baec82c2.tar.xz
apache2-f0879f75422c1b0ed19066de0c43f304baec82c2.zip
mod_proxy_html: support automatic doctype detection.
PR 56285
Patch by Micha Lenk, adapted by niq

git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1599027 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r--CHANGES3
-rw-r--r--docs/manual/mod/mod_proxy_html.xml11
-rw-r--r--modules/filters/mod_proxy_html.c37
3 files changed, 45 insertions, 6 deletions
diff --git a/CHANGES b/CHANGES
index e377a6e034..dca11889d1 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,6 +1,9 @@
-*- coding: utf-8 -*-
Changes with Apache 2.5.0
+ *) mod_proxy_html: support automatic detection of doctype and processing
+ of FPIs. PR56285 [Micha Lenk <micha lenk info>, Nick Kew]
+
*) mod_proxy_html: skip documents shorter than 4 bytes
PR 56286 [Micha Lenk <micha lenk info>]
diff --git a/docs/manual/mod/mod_proxy_html.xml b/docs/manual/mod/mod_proxy_html.xml
index eeed13e5f9..b752a1796e 100644
--- a/docs/manual/mod/mod_proxy_html.xml
+++ b/docs/manual/mod/mod_proxy_html.xml
@@ -227,7 +227,10 @@ for earlier 2.x versions</compatibility>
<name>ProxyHTMLDocType</name>
<description>Sets an HTML or XHTML document type declaration.</description>
<syntax>ProxyHTMLDocType <var>HTML|XHTML [Legacy]</var><br/><strong>OR</strong>
-<br/>ProxyHTMLDocType <var>fpi [SGML|XML]</var></syntax>
+<br/>ProxyHTMLDocType <var>fpi [SGML|XML]</var><br/><strong>OR</strong>
+<br/>ProxyHTMLDocType <var>html5</var><br/><strong>OR</strong>
+<br/>ProxyHTMLDocType <var>auto</var></syntax>
+<default>ProxyHTMLDocType auto (2.5/trunk versions); no FPI (2.4.x)</default>
<contextlist><context>server config</context>
<context>virtual host</context><context>directory</context>
</contextlist>
@@ -245,9 +248,9 @@ be necessary if you are proxying pre-1998 content or working with defective
authoring/publishing tools.</p>
<p>In the second form, it will insert your own FPI. The optional second
argument determines whether SGML/HTML or XML/XHTML syntax will be used.</p>
-<p>The default is changed to omitting any FPI,
-on the grounds that no FPI is better than a bogus one. If your backend
-generates decent HTML or XHTML, set it accordingly.</p>
+<p>The third form declares documents as HTML 5.</p>
+<p>The fourth form (<code>auto</code>) is new in HTTPD trunk and not yet
+available in released versions. It uses libxml2's HTML parser to detect the
+doctype of each document.</p>
<p>If the first form is used, mod_proxy_html
will also clean up the HTML to the specified standard. It cannot
fix every error, but it will strip out bogus elements and attributes.
diff --git a/modules/filters/mod_proxy_html.c b/modules/filters/mod_proxy_html.c
index 8db7997e36..dcd4b1a5b2 100644
--- a/modules/filters/mod_proxy_html.c
+++ b/modules/filters/mod_proxy_html.c
@@ -108,6 +108,7 @@ typedef struct {
size_t avail;
const char *encoding;
urlmap *map;
+ const char *etag;
} saxctxt;
@@ -280,6 +281,33 @@ static void dump_content(saxctxt *ctx)
}
AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1);
}
+/*
+ * SAX internalSubset callback (installed as sax.internalSubset): re-emits
+ * the DOCTYPE declaration reported by the parser.
+ *
+ *   ctxt        the per-request saxctxt, passed as opaque user data
+ *   name        the doctype name, e.g. "html"
+ *   externalID  the public identifier (FPI), or NULL if none was given
+ *   sysID       the system identifier, or NULL if none was given
+ *
+ * Only acts when the configured doctype is DEFAULT_DOCTYPE; any other
+ * configured doctype takes precedence and this callback writes nothing.
+ * When a public identifier is present, ctx->etag is also switched to
+ * xhtml_etag for an "html" doctype whose FPI starts with
+ * "-//W3C//DTD XHTML ", and to html_etag otherwise.  Without a public
+ * identifier ctx->etag is left as it was.
+ */
+static void pinternalSubset(void* ctxt, const xmlChar *name,
+                            const xmlChar *externalID, const xmlChar *sysID)
+{
+    saxctxt* ctx = (saxctxt*) ctxt;
+    if (!ctxt || !name) {
+        /* sanity check */
+        return;
+    }
+    if (ctx->cfg->doctype != DEFAULT_DOCTYPE) {
+        /* do nothing if overridden in config */
+        return;
+    }
+    ap_fputstrs(ctx->f->next, ctx->bb, "<!DOCTYPE ", (const char *)name, NULL);
+    if (externalID) {
+        /* 18 == strlen("-//W3C//DTD XHTML "): prefix match on the FPI */
+        if (!strcasecmp((const char*)name, "html") &&
+            !strncasecmp((const char *)externalID, "-//W3C//DTD XHTML ", 18)) {
+            ctx->etag = xhtml_etag;
+        }
+        else {
+            ctx->etag = html_etag;
+        }
+        ap_fputstrs(ctx->f->next, ctx->bb, " PUBLIC \"", (const char *)externalID, "\"", NULL);
+        if (sysID) {
+            ap_fputstrs(ctx->f->next, ctx->bb, " \"", (const char *)sysID, "\"", NULL);
+        }
+    }
+    else if (sysID) {
+        /* SYSTEM-only declaration, e.g. <!DOCTYPE html SYSTEM "about:legacy-compat">:
+         * keep the system identifier instead of silently dropping it. */
+        ap_fputstrs(ctx->f->next, ctx->bb, " SYSTEM \"", (const char *)sysID, "\"", NULL);
+    }
+    ap_fputs(ctx->f->next, ctx->bb, ">\n");
+}
static void pcdata(void *ctxt, const xmlChar *uchars, int length)
{
const char *chars = (const char*) uchars;
@@ -632,7 +660,7 @@ static void pstartElement(void *ctxt, const xmlChar *uname,
}
ctx->offset = 0;
if (desc && desc->empty)
- ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag);
+ ap_fputs(ctx->f->next, ctx->bb, ctx->etag);
else
ap_fputc(ctx->f->next, ctx->bb, '>');
@@ -837,6 +865,7 @@ static saxctxt *check_filter_init (ap_filter_t *f)
fctx->bb = apr_brigade_create(f->r->pool,
f->r->connection->bucket_alloc);
fctx->cfg = cfg;
+ fctx->etag = cfg->etag;
apr_table_unset(f->r->headers_out, "Content-Length");
if (cfg->interp)
@@ -1129,7 +1158,10 @@ static const char *set_doctype(cmd_parms *cmd, void *CFG,
const char *t, const char *l)
{
proxy_html_conf *cfg = (proxy_html_conf *)CFG;
- if (!strcasecmp(t, "xhtml")) {
+ if (!strcasecmp(t, "auto")) {
+ cfg->doctype = DEFAULT_DOCTYPE; /* activates pinternalSubset */
+ }
+ else if (!strcasecmp(t, "xhtml")) {
cfg->etag = xhtml_etag;
if (l && !strcasecmp(l, "legacy"))
cfg->doctype = fpi_xhtml_legacy;
@@ -1249,6 +1281,7 @@ static int mod_proxy_html(apr_pool_t *p, apr_pool_t *p1, apr_pool_t *p2)
sax.characters = pcharacters;
sax.comment = pcomment;
sax.cdataBlock = pcdata;
+ sax.internalSubset = pinternalSubset;
xml2enc_charset = APR_RETRIEVE_OPTIONAL_FN(xml2enc_charset);
xml2enc_filter = APR_RETRIEVE_OPTIONAL_FN(xml2enc_filter);
if (!xml2enc_charset) {