package Iotta::HTML::Validate;
use strict;
require Iotta::Elem;
our @ISA = ("Iotta::Elem");
sub filter {
my $self = shift;
my $in = shift;
print STDERR " validating...\n";
if (! &validate ($in)) {
print STDERR "invalid html, exiting\n";
exit (1);
};
return $in;
}
sub run {
my $self = shift;
my $meta = shift;
my $data = shift;
print STDERR " validating...\n";
if (! &validate ($data)) {
print STDERR "invalid html, exiting\n";
exit (1);
}
$self->next->run ($meta, $data) if ($self->next);
}
sub setup {
}
sub initialize {
my $self = shift;
#print STDERR " initializing html validator...\n";
$self->SUPER::initialize (@_);
}
#####################################################################
#
# WDG HTML Validator <http://www.htmlhelp.com/tools/validator/>
# by Liam Quinn <liam@htmlhelp.com>
#
# Copyright (c) 1998-2002 by Liam Quinn
# This program is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#
# Contributors:
# Darxus@ChaosReigns.com
#
#####################################################################
#####################################################################
# Required libraries #
######################
use lib '/usr/share/wdg-html-validator';
use I18N::Charset;
use Unicode::Map8;
use Unicode::String qw(utf8 ucs2);
use POSIX;
use strict;
#####################################################################
sub validate {
my $version = '1.5.2';
# Location of jconv Japanese character encoding converter
my $jconv = '/usr/bin/jconv';
# Location of cjkvconv.pl CJK character encoding converter
my $cjkvconv = '/usr/share/wdg-html-validator/cjkvconv.pl';
# SGML directory (catalog, DTDs, SGML declarations)
my $sgmlDir = '/usr/share/sgml/html/dtd';
# nsgmls command line
# The SGML declaration and name of the temporary file storing the retrieved
# document will be appended to this string
my $nsgmls = '/usr/bin/nsgmls -E0 -s';
# Warnings to pass on command-line to nsgmls, if desired
my $nsgmlsWarnings = '-wnon-sgml-char-ref -wmin-tag';
my $nsgmlsXMLWarnings = '-wxml';
# nsgmls "errors" that are not reported unless warnings are requested.
# These are true errors in XML validation, but they should only be
# reported as warnings otherwise.
my %errorAsWarning = (
' net-enabling start-tag not supported in {{XML}}' => 1,
' unclosed start-tag' => 1,
' unclosed end-tag' => 1
);
# Catalog files for HTML and XHTML
my $htmlCatalog = "/etc/sgml/catalog";
my $xhtmlCatalog = "$sgmlDir/xml/1.0/xhtml.soc";
# Text preceding identification of the document checked
my %documentChecked = (
# English
'en' => 'Document Checked'
);
# Text preceding identification of the character encoding
my %characterEncoding = (
# English
'en' => '<a href="/wdg-html-validator/charset.html">Character encoding</a>:'
);
# Text preceding the level of HTML checked
my %levelOfHTML = (
# English
'en' => '<a href="/wdg-html-validator/doctype.html">Level of HTML</a>:'
);
# Text indicating that only a check for well-formedness was performed
my %wellformednessCheck = (
#English
'en' => 'Checked for <strong><a href="http://www.w3.org/TR/REC-xml#dt-wellformed">well-formedness</a> only</strong> (no <a href="/wdg-html-validator/doctype.html">DOCTYPE</a> found)'
);
# Default DOCTYPE for forgetful users
my $defaultDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">';
# Default DOCTYPE if the document contains frames
my $defaultFramesetDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
"http://www.w3.org/TR/html4/frameset.dtd">';
# Error for missing DOCTYPE
my %noDoctype = (
# English error message
'en' => "missing {{document type declaration}}; assuming {{HTML 4.01 Transitional}}"
);
# Error for missing DOCTYPE in a Frameset document
my %noFramesetDoctype = (
# English error message
'en' => "missing {{document type declaration}}; assuming {{HTML 4.01 Frameset}}"
);
# Message if the document is valid
my %noErrors = (
# English
'en' => 'Congratulations, no errors!'
);
# Text to precede an error message
my %preError = (
# English
'en' => 'Error:'
);
# Text to precede a warning message
my %preWarning = (
# English
'en' => 'Warning:'
);
# Heading for errors (not used if warnings are desired)
my %errorsHeading = (
# English
'en' => 'Errors'
);
# Heading for errors and warnings (only used if warnings are desired)
my %errorsAndWarningsHeading = (
# English
'en' => 'Errors and Warnings'
);
# Heading for input listing
my %inputHeading = (
# English
'en' => 'Input'
);
# Text to precede line number
my %lineNumberText = (
# English
'en' => 'Line '
);
# Text to precede character number
my %characterNumberText = (
# English
'en' => 'character '
);
# Mapping from IANA charset name to preferred MIME name
my %MIMECharset = (
'ISO_8859-1:1987' => 'ISO-8859-1',
'ISO_8859-2:1987' => 'ISO-8859-2',
'ISO_8859-3:1988' => 'ISO-8859-3',
'ISO_8859-4:1988' => 'ISO-8859-4',
'ISO_8859-5:1988' => 'ISO-8859-5',
'ISO_8859-6:1987' => 'ISO-8859-6',
'ISO_8859-6-E' => 'ISO-8859-6-e',
'ISO_8859-6-I' => 'ISO-8859-6-i',
'ISO_8859-7:1987' => 'ISO-8859-7',
'ISO_8859-8:1988' => 'ISO-8859-8',
'ISO_8859-8-E' => 'ISO-8859-8-e',
'ISO_8859-8-I' => 'ISO-8859-8-i',
'ISO_8859-9:1989' => 'ISO-8859-9',
'ISO-8859-10' => 'ISO-8859-10',
'iso-8859-13' => 'ISO-8859-13',
'iso-8859-14' => 'ISO-8859-14',
'ISO-8859-15' => 'ISO-8859-15',
'UTF-8' => 'UTF-8',
'ISO-2022-JP' => 'ISO-2022-JP',
'Extended_UNIX_Code_Packed_Format_for_Japanese' => 'EUC-JP',
'EUC-KR' => 'EUC-KR',
'GB2312' => 'GB2312',
'Shift_JIS' => 'Shift_JIS',
'Big5' => 'Big5',
'windows-1250' => 'windows-1250',
'windows-1251' => 'windows-1251',
'windows-1252' => 'windows-1252',
'windows-1253' => 'windows-1253',
'windows-1254' => 'windows-1254',
'windows-1255' => 'windows-1255',
'windows-1256' => 'windows-1256',
'windows-1257' => 'windows-1257',
'windows-1258' => 'windows-1258',
'KOI8-R' => 'KOI8-R',
'KOI8-U' => 'KOI8-U',
'IBM866' => 'cp866',
'cp874' => 'cp874',
'CP874' => 'cp874',
'TIS-620' => 'TIS-620',
'VISCII' => 'VISCII',
'VPS' => 'x-viet-vps',
'TCVN-5712:1993' => 'x-viet-tcvn', # likely IANA name if it's ever registered
'TCVN-5712' => 'x-viet-tcvn', # name returned by I18N::Charset now through Unicode::Map8
'ANSI_X3.4-1968' => 'US-ASCII',
# Some versions of I18N::Charset return incorrect IANA charset names
'ISO-8859-1' => 'ISO-8859-1',
'ISO-8859-1-Windows-3.1-Latin-1' => 'ISO-8859-1',
'ISO-8859-2-Windows-Latin-2' => 'ISO-8859-2',
'ISO-8859-9-Windows-Latin-5' => 'ISO-8859-9'
);
# Mapping from preferred MIME name to name required by nsgmls
my %encodings = (
'US-ASCII' => 'ISO-8859-1',
'ISO-8859-1' => 'ISO-8859-1',
'ISO-8859-2' => 'ISO-8859-2',
'ISO-8859-3' => 'ISO-8859-3',
'ISO-8859-4' => 'ISO-8859-4',
'ISO-8859-5' => 'ISO-8859-5',
'ISO-8859-6' => 'ISO-8859-6',
'ISO-8859-6-e' => 'ISO-8859-6',
'ISO-8859-6-i' => 'ISO-8859-6',
'ISO-8859-7' => 'ISO-8859-7',
'ISO-8859-8' => 'ISO-8859-8',
'ISO-8859-8-e' => 'ISO-8859-8',
'ISO-8859-8-i' => 'ISO-8859-8',
'ISO-8859-9' => 'ISO-8859-9',
'UTF-8' => 'UTF-8',
'EUC-JP' => 'EUC-JP',
'EUC-KR' => 'EUC-KR',
'GB2312' => 'GB2312',
'Big5' => 'Big5',
'Shift_JIS' => 'Shift_JIS',
# The following character encodings will be converted to UTF-8 for
# parsing by nsgmls
'ISO-8859-10' => 'UTF-8',
'ISO-8859-13' => 'UTF-8',
'ISO-8859-14' => 'UTF-8',
'ISO-8859-15' => 'UTF-8',
'windows-1250' => 'UTF-8',
'windows-1251' => 'UTF-8',
'windows-1252' => 'UTF-8',
'windows-1253' => 'UTF-8',
'windows-1254' => 'UTF-8',
'windows-1255' => 'UTF-8',
'windows-1256' => 'UTF-8',
'windows-1257' => 'UTF-8',
'windows-1258' => 'UTF-8',
'KOI8-R' => 'UTF-8',
'KOI8-U' => 'UTF-8',
'cp866' => 'UTF-8',
'cp874' => 'UTF-8',
'TIS-620' => 'UTF-8',
'VISCII' => 'UTF-8',
'x-viet-vps' => 'UTF-8',
'x-viet-tcvn' => 'UTF-8'
);
# Hash table of multibyte character encodings supported
# The value is a regular expression representing a single character
# in the encoding.
my %multibyte = (
'UTF-8' => '[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEF][\x80-\xBF][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]|[\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]',
'EUC-JP' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]|\x8E[\xA0-\xDF]|\x8F[\xA1-\xFE][\xA1-\xFE]',
'EUC-KR' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]',
'GB2312' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]',
'Big5' => '[\x00-\x7E]|[\xA1-\xFE][\x40-\x7E\xA1-\xFE]',
'Shift_JIS' => '[\x00-\x7F]|[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]|[\xA0-\xDF]'
);
# Hash table of character encodings that must be converted before validation
my %conversionNeeded = (
'ISO-8859-10' => 1,
'ISO-8859-13' => 1,
'ISO-8859-14' => 1,
'ISO-8859-15' => 1,
'windows-1250' => 1,
'windows-1251' => 1,
'windows-1252' => 1,
'windows-1253' => 1,
'windows-1254' => 1,
'windows-1255' => 1,
'windows-1256' => 1,
'windows-1257' => 1,
'windows-1258' => 1,
'KOI8-R' => 1,
'KOI8-U' => 1,
'cp866' => 1,
'cp874' => 1,
'TIS-620' => 1,
'VISCII' => 1,
'x-viet-vps' => 1,
'x-viet-tcvn' => 1
);
# Versions of HTML associated with a given FPI
my %HTMLversion = (
# ADD 'PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN"' => 'XHTML 1.1 plus MathML 2.0',
# ADD 'PUBLIC "-//W3C//DTD MathML 2.0//EN"' => 'MathML 2.0',
# ADD 'PUBLIC "-//W3C//DTD XHTML 1.1//EN"' => 'XHTML 1.1',
'PUBLIC "-//WAPFORUM//DTD WML 1.3//EN"' => 'WML 1.3',
'PUBLIC "-//WAPFORUM//DTD WML 1.2//EN"' => 'WML 1.2',
'PUBLIC "-//WAPFORUM//DTD WML 1.1//EN"' => 'WML 1.1',
'PUBLIC "-//WAPFORUM//DTD WML 1.0//EN"' => 'WML 1.0',
'PUBLIC "-//W3C//DTD XHTML Basic 1.0//EN"' => 'XHTML Basic',
'PUBLIC "ISO/IEC 15445:2000//DTD HyperText Markup Language//EN"' => 'ISO/IEC 15445:2000',
'PUBLIC "ISO/IEC 15445:2000//DTD HTML//EN"' => 'ISO/IEC 15445:2000',
'PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"' => 'XHTML 1.0 Strict',
'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' => 'XHTML 1.0 Transitional',
'PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"' => 'XHTML 1.0 Frameset',
'PUBLIC "-//W3C//DTD HTML 4.01//EN"' => 'HTML 4.01 Strict',
'PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"' => 'HTML 4.01 Transitional',
'PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"' => 'HTML 4.01 Frameset',
'PUBLIC "-//W3C//DTD HTML 4.0//EN"' => 'HTML 4.0 Strict',
'PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"' => 'HTML 4.0 Transitional',
'PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"' => 'HTML 4.0 Frameset',
'PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"' => 'HTML 3.2',
'PUBLIC "-//W3C//DTD HTML 3.2 Draft//EN"' => 'HTML 3.2',
'PUBLIC "-//W3C//DTD HTML 3.2//EN"' => 'HTML 3.2',
'PUBLIC "-//W3C//DTD HTML Experimental 970421//EN"' => 'HTML 3.2 + Style',
'PUBLIC "-//W3O//DTD W3 HTML 3.0//EN"' => 'HTML 3.0 Draft',
'PUBLIC "-//IETF//DTD HTML 3.0//EN//"' => 'HTML 3.0 Draft',
'PUBLIC "-//IETF//DTD HTML 3.0//EN"' => 'HTML 3.0 Draft',
'PUBLIC "-//IETF//DTD HTML i18n//EN"' => 'HTML 2.0 + i18n',
'PUBLIC "-//IETF//DTD HTML//EN"' => 'HTML 2.0',
'PUBLIC "-//IETF//DTD HTML 2.0//EN"' => 'HTML 2.0',
'PUBLIC "-//IETF//DTD HTML Level 2//EN"' => 'HTML 2.0',
'PUBLIC "-//IETF//DTD HTML 2.0 Level 2//EN"' => 'HTML 2.0',
'PUBLIC "-//IETF//DTD HTML Level 1//EN"' => 'HTML 2.0 Level 1',
'PUBLIC "-//IETF//DTD HTML 2.0 Level 1//EN"' => 'HTML 2.0 Level 1',
'PUBLIC "-//IETF//DTD HTML Strict//EN"' => 'HTML 2.0 Strict',
'PUBLIC "-//IETF//DTD HTML 2.0 Strict//EN"' => 'HTML 2.0 Strict',
'PUBLIC "-//IETF//DTD HTML Strict Level 2//EN"' => 'HTML 2.0 Strict',
'PUBLIC "-//IETF//DTD HTML 2.0 Strict Level 2//EN"' => 'HTML 2.0 Strict',
'PUBLIC "-//IETF//DTD HTML Strict Level 1//EN"' => 'HTML 2.0 Strict Level 1',
'PUBLIC "-//IETF//DTD HTML 2.0 Strict Level 1//EN"' => 'HTML 2.0 Strict Level 1'
);
# SGML declarations for a given level of HTML
my %sgmlDecl = (
# ADD 'XHTML 1.1 plus MathML 2.0' => "$sgmlDir/xhtml11/xml1n.dcl",
# ADD 'MathML 2.0' => "$sgmlDir/xhtml11/xml1n.dcl",
# ADD 'XHTML 1.1' => "$sgmlDir/xhtml11/xml1n.dcl",
'WML 1.3' => "$sgmlDir/xml/1.0/xhtml1.dcl",
'WML 1.2' => "$sgmlDir/xml/1.0/xhtml1.dcl",
'WML 1.1' => "$sgmlDir/xml/1.0/xhtml1.dcl",
'WML 1.0' => "$sgmlDir/xml/1.0/xhtml1.dcl",
'XHTML Basic' => "$sgmlDir/xml/1.0/xml1.dcl",
'ISO/IEC 15445:2000' => "$sgmlDir/iso-15445/15445.dcl",
'XHTML 1.0 Strict' => "$sgmlDir/xml/1.0/xhtml1.dcl",
'XHTML 1.0 Transitional' => "$sgmlDir/xml/1.0/xhtml1.dcl",
'XHTML 1.0 Frameset' => "$sgmlDir/xml/1.0/xhtml1.dcl",
'HTML 4.01 Strict' => "$sgmlDir/4.01/HTML4.decl",
'HTML 4.01 Transitional' => "$sgmlDir/4.01/HTML4.decl",
'HTML 4.01 Frameset' => "$sgmlDir/4.01/HTML4.decl",
'HTML 4.0 Strict' => "$sgmlDir/4.0/HTML4.decl",
'HTML 4.0 Transitional' => "$sgmlDir/4.0/HTML4.decl",
'HTML 4.0 Frameset' => "$sgmlDir/4.0/HTML4.decl",
'HTML 3.2' => "$sgmlDir/html-32.dcl",
'HTML 3.2 + Style' => "$sgmlDir/html-970421.decl",
'HTML 3.0 Draft' => "$sgmlDir/HTML-3.decl",
'HTML 2.0 + i18n' => "$sgmlDir/html-2-i18n.decl",
'HTML 2.0' => "$sgmlDir/html-2.decl",
'HTML 2.0 Strict' => "$sgmlDir/html-2.decl",
'HTML 2.0 Level 1' => "$sgmlDir/html-2.decl",
'HTML 2.0 Strict Level 1' => "$sgmlDir/html-2.decl",
'Unknown' => "/usr/share/wdg-html-validator/custom.dcl",
'Unknown (XML)' => "$sgmlDir/xml/1.0/xhtml1.dcl"
);
# XHTML DTDs
my %xhtml = (
# ADD 'XHTML 1.1 plus MathML 2.0' => 1,
# ADD 'MathML 2.0' => 1,
# ADD 'XHTML 1.1' => 1,
'WML 1.3' => 1,
'WML 1.2' => 1,
'WML 1.1' => 1,
'WML 1.0' => 1,
'XHTML Basic' => 1,
'XHTML 1.0 Strict' => 1,
'XHTML 1.0 Transitional' => 1,
'XHTML 1.0 Frameset' => 1,
'Unknown (XML)' => 1
);
# Maximum number of extra characters to include in the HTML extract on
# either side of the source of the error
my $extraChars = 30;
my $urlsChecked = 0;
my $document = shift;
my $parser;
my $charset;
my $encodingMsg;
my $xml;
# Determine character encoding of output page
unless ($charset) {
# First check for XML-style charset
# TODO: This check should probably only be done for XHTML documents
if ($document =~ m#^\s*<\?xml\s[^>]*encoding\s*=\s*["']?([^"']+)#iso)
{
$charset = $1;
}
# Check for a META element specifying the character encoding
elsif ($document =~ m#<META(\s[^>]*http\-equiv\s*=\s*["']?Content\-Type["']?[^>]*)>#iso) {
my $metaAttributes = $1;
if ($metaAttributes =~ m#\scontent\s*=\s*["']?.*[\s;]charset\s*=\s*['"]?([^"']+)#iso) {
$charset = $1;
}
}
}
# If we don't know the charset and we're checking XHTML, use UTF-8
# TODO: Support UTF-16
unless ($charset) {
# There is some code repetition here with our DOCTYPE
# checking code later on. I need to think more carefully
# about how best to do DOCTYPE and charset checking
# for XHTML documents.
if ($document =~ /<!DOCTYPE([^>]*)>/iso) {
my $doctypeMeat = $1;
if ($doctypeMeat =~ /PUBLIC\s+("[^"]*")/iso) {
my $htmlLevel = $HTMLversion{"PUBLIC $1"};
if (($htmlLevel && $xhtml{$htmlLevel})
|| (!$htmlLevel && $xml))
{
$charset = 'UTF-8';
}
}
}
}
if ($charset) {
my $enteredCharset = $charset;
# Get IANA name for character encoding
my $charsetName = iana_charset_name($charset);
# Get preferred MIME name
if ($charsetName) {
$charset = $MIMECharset{$charsetName};
} else { # Check for non-IANA charset
$charsetName = map8_charset_name($charset);
if ($charsetName) {
$charset = $MIMECharset{$charsetName};
}
}
if ($charsetName && $charset) {
$encodingMsg = $charset;
# This is a quick hack to add basic support for ISO-2022-JP
if ($charset eq 'ISO-2022-JP') {
# Write document to temporary file for conversion
my ($convTempName, $convTempFH) = getTempFile();
print $convTempFH "$document";
close($convTempFH);
# Convert ISO-2022-JP document to Shift_JIS
open(JCONV, "$jconv -ij -os < $convTempName |")
|| (print STDERR "Server error converting ISO-2022-JP document", return 0);
$document = "";
while (<JCONV>) {
$document .= $_;
}
close(JCONV);
unlink("$convTempName");
$charset = 'Shift_JIS';
}
} else {
$encodingMsg = $enteredCharset
. " (not supported, assuming ISO-8859-1)";
$charset = 'ISO-8859-1';
}
} else {
$encodingMsg = "Unknown (assuming ISO-8859-1)";
$charset = 'ISO-8859-1';
}
my @errors; # queue of errors
my @externalErrors; # queue of errors in an external DTD
# Amount to decrease line count by (i.e., if we add a DOCTYPE)
my $lineAdjust = 0;
my $validatorInput;
my $noValid = 0; # check for validity by default
# Adjust line-endings (nsgmls doesn't recognize Mac newlines)
$document =~ s#\r(?!\n)#\n#go;
# Determine the level of HTML
my $htmlLevel = 'Unknown';
if ($document =~ /<!DOCTYPE([^>]*)>/iso) {
my $doctypeMeat = $1;
if ($doctypeMeat =~ /PUBLIC\s+("[^"]*")/iso) {
$htmlLevel = $HTMLversion{"PUBLIC $1"} || 'Unknown';
}
$validatorInput = $document;
if ($htmlLevel eq 'Unknown' && $xml) {
$htmlLevel = 'Unknown (XML)';
}
} else { # Missing DOCTYPE
# If the document is XML, just check for well-formedness,
# not validity.
if ($xml) {
$htmlLevel = 'Unknown (XML)';
$validatorInput = $document;
$noValid = 1;
} else {
# Add a default DOCTYPE
my ($insertedDoctype, $doctypeError);
if ($document =~ /<FRAMESET/io) {
$insertedDoctype = $defaultFramesetDoctype;
$doctypeError = "missing doctype! (assuming \"HTML 4.01 Frameset\")";
} else {
$insertedDoctype = $defaultDoctype;
$doctypeError = "missing doctype! (assuming \"HTML 4.01 Transitional\")";
}
# Remove byte-order mark from UTF-8 document so we don't have to
# bother slipping the added DOCTYPE after it.
if ($charset eq 'UTF-8') {
$document =~ s/^\xEF\xBB\xBF//o;
}
$validatorInput = "$insertedDoctype" . "\n$document";
# Calculate line adjustment
$lineAdjust = 2;
# Add error message
push(@errors, "::" . (1 + $lineAdjust) . ":0:E: $doctypeError");
}
}
# Determine whether we're dealing with HTML or XHTML
if ($xhtml{$htmlLevel}) {
$ENV{'SGML_CATALOG_FILES'} = $xhtmlCatalog;
} else {
$ENV{'SGML_CATALOG_FILES'} = $htmlCatalog;
}
# Prepare an array of lines in the document for easy access to a given line
my @lines = split(/\n/, $document);
# If necessary, convert to a character encoding (UTF-8) recognized by nsgmls
my $map;
if ($conversionNeeded{$charset}) {
#print STDERR " WARNING: unable to reencode, skipping validation...\n";
#return 1;
$map = Unicode::Map8->new($charset)
|| (print STDERR "Unable to create character encoding map for $charset\n", return 0);
# Pass through invalid characters
$map->nostrict;
# Convert document to UTF-8
$validatorInput = $map->tou($validatorInput)->utf8;
}
# Put the document in a temporary file
my ($tempname, $tempfh) = getTempFile();
print $tempfh "$validatorInput";
close($tempfh);
my $noValidCmd = '';
if ($noValid) {
$noValidCmd = '-wno-valid';
}
my $warningsCmd = '';
if ($xml || $xhtml{$htmlLevel}) {
$warningsCmd = "$nsgmlsXMLWarnings";
} else {
$warningsCmd = "$nsgmlsWarnings";
}
# Run the validator
$ENV{'SP_CHARSET_FIXED'} = 1;
$ENV{'SP_ENCODING'} = $encodings{$charset};
open(NSGMLS, "$nsgmls -b$encodings{$charset} $noValidCmd $warningsCmd $sgmlDecl{$htmlLevel} $tempname 2>&1 |")
|| &error("Server error");
# Create a queue of errors
while (<NSGMLS>) {
chomp;
# Convert character encodings, if necessary
if (defined($map)) {
$_ = $map->to8(utf8($_)->ucs2);
}
my @error = split(/:/, $_, 6);
if ($error[4] eq 'E' || $error[4] eq 'X') {
# With warnings enabled in non-XML validation, some "errors"
# reported by nsgmls are probably better reported as "warnings"
# since they are only reported with warnings enabled.
if (!($xml || $xhtml{$htmlLevel})) {
if ($errorAsWarning{$error[5]}) {
$error[4] = 'W';
# lq-nsgmls uses an XML-specific message for one of
# these warnings. Let's try something more helpful
# for HTML. [This still doesn't seem very good;
# anyone have suggestions for improvement?]
if ($error[5] eq " net-enabling start-tag not supported in {{XML}}") {
$error[5] = " net-enabling start-tag; possibly missing required quotes around an attribute value";
}
$_ = join(':', @error);
}
}
push(@errors, $_);
# If the DOCTYPE is bad, bail out
last if ($error[5] eq " unrecognized {{DOCTYPE}}; unable to check document");
} elsif ($error[4] eq 'W') {
unless ($error[5] eq " characters in the document character set with numbers exceeding 65535 not supported")
{
# Should we separate warnings more explicitly from errors?
# For now let's lump them together.
push(@errors, $_);
}
} elsif ($error[1] =~ /^<URL>/o) { # error from external DTD
push(@externalErrors, $_);
} elsif (length($error[4]) > 1 # Allow secondary messages about preceding error
&& $error[3] ne 'W') # Prevent error about SGML declaration not implied with -wxml
{
push(@errors, $_);
}
}
close(NSGMLS);
# Delete temporary file
unlink $tempname;
# Remove byte-order mark from UTF-8 document in case we output the
# line containing the byte-order mark
if ($charset eq 'UTF-8') {
$lines[0] =~ s/^\xEF\xBB\xBF//o;
}
if ($#errors > -1 || $#externalErrors > -1) {
# Print character encoding information
print STDERR " encoding: $encodingMsg\n";
# Print level of HTML checked
print STDERR " doctype: $htmlLevel\n";
if ($noValid) {
print STDERR " wellformedness check only\n";
}
}
# Report errors
if ($#errors > -1 || $#externalErrors > -1) {
foreach (@externalErrors) {
my @error = split(/:/, $_, 7);
# Determine URL containing the error
my $errorURL;
if ($error[1] =~ /<URL>(.+)/o) {
$errorURL = "$1:$error[2]";
}
my $lineNumber = $error[3];
my $character = $error[4] + 1;
print STDERR " $lineNumber, $character: ";
if ($error[6]) {
print STDERR " " . $error[6];
} else {
print STDERR " " . $error[5];
}
print STDERR "\n";
}
foreach (@errors) {
my @error = split(/:/, $_, 6);
# I don't think this should happen, but I'm not sure
next if $#error < 4;
# Determine line number and character of error
my $lineNumber = $error[2] - $lineAdjust;
next unless $lineNumber > 0;
my $character = $error[3] + 1;
print STDERR " $lineNumber, $character: ";
# Extract relevant section of HTML source.
my ($line, $preMatch, $maxMatch, $spacesToAdd, $extract, $insertedSpaces, $tabcount, $lineLength, $oneChar);
$oneChar = ($multibyte{$charset} || '.');
$line = superChomp($lines[$lineNumber-1]);
$lineLength = ulength($line, $oneChar);
$preMatch = max(0, $character - $extraChars);
$maxMatch = 2 * $extraChars;
($extract) = ($line =~ /
(?:$oneChar)
{$preMatch}
((?:$oneChar)
{1,$maxMatch})/x);
$spacesToAdd = $error[3];
# Expand tabs in the first part of the string to ensure that our character
# pointer lines up correctly
($insertedSpaces, $tabcount) = (0, 0);
if ($extract =~ /\t/o) {
my ($firstPart, $secondPart) =
($extract =~ /^(
(?:$oneChar)
{0,$spacesToAdd})
(.*)$/sx);
($insertedSpaces, $tabcount, $firstPart) = tabExpand($firstPart);
$extract = "$firstPart$secondPart";
$spacesToAdd = $spacesToAdd - $tabcount + $insertedSpaces;
}
if (length($extract) > 0) {
#$extract = "<code class=html>" . $query->escapeHTML($extract) . "</code>";
# Check if the line was truncated for the extract
if ($preMatch > 0) {
$extract = "... $extract";
$spacesToAdd = $extraChars + 3 - $tabcount + $insertedSpaces;
}
if ($preMatch + $maxMatch < ulength($line, $oneChar)) {
$extract = "$extract ...";
}
# Link element names in extract
$extract = linkElements($extract);
print STDERR " $extract\n";
}
# Prepare error message, adding emphasis and links where appropriate
my $errorMsg;
if ($error[5]) {
$errorMsg = superChomp($error[5]);
} else {
$errorMsg = superChomp($error[4]);
}
while ($errorMsg =~ m#\{\{(?:")?(.+?)(?:")?\}\}#gos) {
my $linkText = $1;
my $lcLinkText = lc($linkText);
$errorMsg =~ s#\{\{(")?$linkText(")?\}\}# $1$linkText$2#;
}
# Workaround for the incorrect display of the following error:
# value of attribute "NOWRAP" cannot be ""; must be one of
# "NOWRAP"
$errorMsg =~ s#""#""#go;
$errorMsg =~ s#"(.+?)"#"$1"#g;
if ($error[4] eq 'E' || $error[4] eq 'X') { # Error message
print STDERR " error: ";
} elsif ($error[4] eq 'W') { # warning
print STDERR " warning: ";
}
print STDERR "$errorMsg\n";
}
} else { # no errors
#print STDERR " (no errors found)\n";
return 1;
}
return 0;
}
# Trim leading and trailing whitespace from a string
# Takes a string as the first argument and returns the new string
sub trim {
my $str = shift || return;
$str =~ s/^\s+//go;
$str =~ s/\s+$//go;
return $str;
}
# Encode unsafe characters in a file URL
sub encodeFileURL {
my $url = shift || return;
$url =~ s/ /\%20/go;
$url =~ s/"/\%22/go;
$url =~ s/\#/\%23/go;
return $url;
}
# Return the maximum of two numbers
sub max {
if ($_[0] > $_[1]) {
return $_[0];
} else {
return $_[1];
}
}
# Expand tabs in a string
# Return a list of the number of spaces inserted, the number of
# tabs removed, and the expanded string
# (This is a modified version of Text::Tabs::expand.)
sub tabExpand
{
my @l = @_;
my $tabstop = 8;
my $totalSpacesAdded = 0;
my $totalTabCount = 0;
for $_ (@l) {
my $spacesAdded;
my $tabCount;
while (s/(^|\n)([^\t\n]*)(\t+)/
$1. $2 . (" " x
($spacesAdded = ($tabstop * ($tabCount = length($3))
- (length($2) % $tabstop))))
/gsex)
{
$totalSpacesAdded += $spacesAdded;
$totalTabCount += $tabCount;
}
}
return ($totalSpacesAdded, $totalTabCount, @l);
}
# Link element names in HTML code to the appropriate reference page
# The first argument is the input string
# Returns the string with links inserted
sub linkElements {
my $code = shift || return;
while ($code =~ m#<([^\s&]+)#go) {
my $linkText = $1;
my $lcLinkText = lc($linkText);
}
return $code;
}
# Remove any newline characters (\r or \n) at the end of a string
# First argument is the string
# Returns the new string
sub superChomp {
my $str = shift || return;
$str =~ s/[\r\n]+$//o;
return $str;
}
# Return the number of characters in a potentially-multibyte character string
# First argument is the string
# Second argument is a regular expression denoting a single character
# If the string is single-byte, the second argument should be '.' or omitted
sub ulength {
my $str = shift || return 0;
my $oneChar = (shift || '.');
my $length = 0;
if ($oneChar eq '.') { # single-byte
$length = length($str);
} else { # multibyte
while ($str =~ /$oneChar/gos) {
$length++;
}
}
return $length;
}
# Return true if we like the Content-Type, false otherwise
# First argument must be the Content-Type, minus any parameters
sub checkContentType {
my $type = shift;
return ($type eq 'text/html'
|| $type eq 'text/xml'
|| $type eq 'application/xml'
|| $type eq 'application/xhtml+xml'
|| $type eq 'text/sgml'
|| $type eq 'application/sgml'
|| $type eq 'text/vnd.wap.wml'
|| $type eq 'text/x-wap.wml');
}
# Return true if the Content-Type indicates an XML document, false otherwise
# First argument must be the Content-Type, minus any parameters
sub isXMLType {
my $type = shift;
# XML media types
my %xmlMediaTypes = (
'text/xml' => 1,
'application/xml' => 1,
'application/xhtml+xml' => 1,
'text/vnd.wap.wml' => 1,
'text/x-wap.wml' => 1
);
return ($xmlMediaTypes{$type} || $type =~ /\+xml$/oi);
}
# Create temporary file securely
# Returns the name and file handle of the created file
sub getTempFile {
my $filename;
do {
$filename = POSIX::tmpnam();
} until sysopen(FH, $filename, O_RDWR|O_CREAT|O_EXCL, 0666);
return ($filename, \*FH);
}
1;
Platon Group <platon@platon.sk> http://platon.sk/
|