Platon Technologies
neprihlásený Prihlásiť Registrácia
SlovakEnglish
open source software development oslavujeme 10 rokov vývoja otvoreného softvéru! Štvrtok, 18. apríl 2024

Súbor: [Platon] / iotta / Iotta / HTML / Validate.pm (stiahnutie)

Revízia 1.1.1.1, Sun Feb 16 15:48:11 2003 UTC (21 years, 2 months ago) by yenar


Zmeny od 1.1: +0 -0 [lines]

initial import of iotta into platon cvs

package Iotta::HTML::Validate;
use strict;
require Iotta::Elem;
our @ISA = ("Iotta::Elem");

sub filter {
    my $self = shift;
    my $in = shift;
    print STDERR "  validating...\n";
    if (! &validate ($in)) {
        print STDERR "invalid html, exiting\n";
        exit (1);
    };
    return $in;
}

sub run {
    my $self = shift;
    my $meta = shift;
    my $data = shift;
    print STDERR "  validating...\n";
    if (! &validate ($data)) {
        print STDERR "invalid html, exiting\n";
        exit (1);
    }
    $self->next->run ($meta, $data) if ($self->next);
}

sub setup {
}

sub initialize {
    my $self = shift;
    #print STDERR "  initializing html validator...\n";
    $self->SUPER::initialize (@_);
}

#####################################################################
#
# WDG HTML Validator <http://www.htmlhelp.com/tools/validator/>
# by Liam Quinn <liam@htmlhelp.com>
#
# Copyright (c) 1998-2002 by Liam Quinn
# This program is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#
# Contributors:
#    Darxus@ChaosReigns.com
#
#####################################################################

#####################################################################
# Required libraries #
######################

use lib '/usr/share/wdg-html-validator';
use I18N::Charset;
use Unicode::Map8;
use Unicode::String qw(utf8 ucs2);
use POSIX;
use strict;

#####################################################################


sub validate {
    my $version = '1.5.2';
    # Location of jconv Japanese character encoding converter
    my $jconv = '/usr/bin/jconv';
    # Location of cjkvconv.pl CJK character encoding converter
    my $cjkvconv = '/usr/share/wdg-html-validator/cjkvconv.pl';
    # SGML directory (catalog, DTDs, SGML declarations)
    my $sgmlDir = '/usr/share/sgml/html/dtd';

    # nsgmls command line
    # The SGML declaration and name of the temporary file storing the retrieved
    # document will be appended to this string
    my $nsgmls = '/usr/bin/nsgmls -E0 -s';

    # Warnings to pass on command-line to nsgmls, if desired
    my $nsgmlsWarnings = '-wnon-sgml-char-ref -wmin-tag';
    my $nsgmlsXMLWarnings = '-wxml';

    # nsgmls "errors" that are not reported unless warnings are requested.
    # These are true errors in XML validation, but they should only be
    # reported as warnings otherwise.
    my %errorAsWarning = (
      ' net-enabling start-tag not supported in {{XML}}' => 1,
      ' unclosed start-tag' => 1,
      ' unclosed end-tag' => 1
    );

    # Catalog files for HTML and XHTML
    my $htmlCatalog = "/etc/sgml/catalog";
    my $xhtmlCatalog = "$sgmlDir/xml/1.0/xhtml.soc";

    # Text preceding identification of the document checked
    my %documentChecked = (
      # English
      'en' => 'Document Checked'
    );

    # Text preceding identification of the character encoding
    my %characterEncoding = (
      # English
      'en' => '<a href="/wdg-html-validator/charset.html">Character encoding</a>:'
    );

    # Text preceding the level of HTML checked
    my %levelOfHTML = (
      # English
      'en' => '<a href="/wdg-html-validator/doctype.html">Level of HTML</a>:'
    );

    # Text indicating that only a check for well-formedness was performed
    my %wellformednessCheck = (
      #English
      'en' => 'Checked for <strong><a href="http://www.w3.org/TR/REC-xml#dt-wellformed">well-formedness</a> only</strong> (no <a href="/wdg-html-validator/doctype.html">DOCTYPE</a> found)'
    );

    # Default DOCTYPE for forgetful users
    my $defaultDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
       "http://www.w3.org/TR/html4/loose.dtd">';

    # Default DOCTYPE if the document contains frames
    my $defaultFramesetDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
       "http://www.w3.org/TR/html4/frameset.dtd">';

    # Error for missing DOCTYPE
    my %noDoctype = (
      # English error message
      'en' => "missing {{document type declaration}}; assuming {{HTML 4.01 Transitional}}"
    );

    # Error for missing DOCTYPE in a Frameset document
    my %noFramesetDoctype = (
      # English error message
      'en' => "missing {{document type declaration}}; assuming {{HTML 4.01 Frameset}}"
    );

    # Message if the document is valid
    my %noErrors = (
      # English
      'en' => 'Congratulations, no errors!'
    );

    # Text to precede an error message
    my %preError = (
      # English
      'en' => 'Error:'
    );

    # Text to precede a warning message
    my %preWarning = (
      # English
      'en' => 'Warning:'
    );

    # Heading for errors (not used if warnings are desired)
    my %errorsHeading = (
      # English
      'en' => 'Errors'
    );

    # Heading for errors and warnings (only used if warnings are desired)
    my %errorsAndWarningsHeading = (
      # English
      'en' => 'Errors and Warnings'
    );

    # Heading for input listing
    my %inputHeading = (
      # English
      'en' => 'Input'
    );

    # Text to precede line number
    my %lineNumberText = (
      # English
      'en' => 'Line '
    );

    # Text to precede character number
    my %characterNumberText = (
      # English
      'en' => 'character '
    );

    # Mapping from IANA charset name to preferred MIME name
    my %MIMECharset = (
      'ISO_8859-1:1987' => 'ISO-8859-1',
      'ISO_8859-2:1987' => 'ISO-8859-2',
      'ISO_8859-3:1988' => 'ISO-8859-3',
      'ISO_8859-4:1988' => 'ISO-8859-4',
      'ISO_8859-5:1988' => 'ISO-8859-5',
      'ISO_8859-6:1987' => 'ISO-8859-6',
      'ISO_8859-6-E' => 'ISO-8859-6-e',
      'ISO_8859-6-I' => 'ISO-8859-6-i',
      'ISO_8859-7:1987' => 'ISO-8859-7',
      'ISO_8859-8:1988' => 'ISO-8859-8',
      'ISO_8859-8-E' => 'ISO-8859-8-e',
      'ISO_8859-8-I' => 'ISO-8859-8-i',
      'ISO_8859-9:1989' => 'ISO-8859-9',
      'ISO-8859-10' => 'ISO-8859-10',
      'iso-8859-13' => 'ISO-8859-13',
      'iso-8859-14' => 'ISO-8859-14',
      'ISO-8859-15' => 'ISO-8859-15',
      'UTF-8' => 'UTF-8',
      'ISO-2022-JP' => 'ISO-2022-JP',
      'Extended_UNIX_Code_Packed_Format_for_Japanese' => 'EUC-JP',
      'EUC-KR' => 'EUC-KR',
      'GB2312' => 'GB2312',
      'Shift_JIS' => 'Shift_JIS',
      'Big5' => 'Big5',
      'windows-1250' => 'windows-1250',
      'windows-1251' => 'windows-1251',
      'windows-1252' => 'windows-1252',
      'windows-1253' => 'windows-1253',
      'windows-1254' => 'windows-1254',
      'windows-1255' => 'windows-1255',
      'windows-1256' => 'windows-1256',
      'windows-1257' => 'windows-1257',
      'windows-1258' => 'windows-1258',
      'KOI8-R' => 'KOI8-R',
      'KOI8-U' => 'KOI8-U',
      'IBM866' => 'cp866',
      'cp874' => 'cp874',
      'CP874' => 'cp874',
      'TIS-620' => 'TIS-620',
      'VISCII' => 'VISCII',
      'VPS' => 'x-viet-vps',
      'TCVN-5712:1993' => 'x-viet-tcvn', # likely IANA name if it's ever registered
      'TCVN-5712' => 'x-viet-tcvn', # name returned by I18N::Charset now through Unicode::Map8
      'ANSI_X3.4-1968' => 'US-ASCII',

      # Some versions of I18N::Charset return incorrect IANA charset names
      'ISO-8859-1' => 'ISO-8859-1',
      'ISO-8859-1-Windows-3.1-Latin-1' => 'ISO-8859-1',
      'ISO-8859-2-Windows-Latin-2' => 'ISO-8859-2',
      'ISO-8859-9-Windows-Latin-5' => 'ISO-8859-9'
    );

    # Mapping from preferred MIME name to name required by nsgmls
    my %encodings = (
      'US-ASCII' => 'ISO-8859-1',
      'ISO-8859-1' => 'ISO-8859-1',
      'ISO-8859-2' => 'ISO-8859-2',
      'ISO-8859-3' => 'ISO-8859-3',
      'ISO-8859-4' => 'ISO-8859-4',
      'ISO-8859-5' => 'ISO-8859-5',
      'ISO-8859-6' => 'ISO-8859-6',
      'ISO-8859-6-e' => 'ISO-8859-6',
      'ISO-8859-6-i' => 'ISO-8859-6',
      'ISO-8859-7' => 'ISO-8859-7',
      'ISO-8859-8' => 'ISO-8859-8',
      'ISO-8859-8-e' => 'ISO-8859-8',
      'ISO-8859-8-i' => 'ISO-8859-8',
      'ISO-8859-9' => 'ISO-8859-9',
      'UTF-8' => 'UTF-8',
      'EUC-JP' => 'EUC-JP',
      'EUC-KR' => 'EUC-KR',
      'GB2312' => 'GB2312',
      'Big5' => 'Big5',
      'Shift_JIS' => 'Shift_JIS',

      # The following character encodings will be converted to UTF-8 for
      # parsing by nsgmls
      'ISO-8859-10' => 'UTF-8',
      'ISO-8859-13' => 'UTF-8',
      'ISO-8859-14' => 'UTF-8',
      'ISO-8859-15' => 'UTF-8',
      'windows-1250' => 'UTF-8',
      'windows-1251' => 'UTF-8',
      'windows-1252' => 'UTF-8',
      'windows-1253' => 'UTF-8',
      'windows-1254' => 'UTF-8',
      'windows-1255' => 'UTF-8',
      'windows-1256' => 'UTF-8',
      'windows-1257' => 'UTF-8',
      'windows-1258' => 'UTF-8',
      'KOI8-R' => 'UTF-8',
      'KOI8-U' => 'UTF-8',
      'cp866' => 'UTF-8',
      'cp874' => 'UTF-8',
      'TIS-620' => 'UTF-8',
      'VISCII' => 'UTF-8',
      'x-viet-vps' => 'UTF-8',
      'x-viet-tcvn' => 'UTF-8'
    );


    # Hash table of multibyte character encodings supported
    # The value is a regular expression representing a single character
    # in the encoding.
    my %multibyte = (
      'UTF-8' => '[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEF][\x80-\xBF][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]|[\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]',
      'EUC-JP' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]|\x8E[\xA0-\xDF]|\x8F[\xA1-\xFE][\xA1-\xFE]',
      'EUC-KR' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]',
      'GB2312' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]',
      'Big5' => '[\x00-\x7E]|[\xA1-\xFE][\x40-\x7E\xA1-\xFE]',
      'Shift_JIS' => '[\x00-\x7F]|[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]|[\xA0-\xDF]'
    );


    # Hash table of character encodings that must be converted before validation
    my %conversionNeeded = (
      'ISO-8859-10' => 1,
      'ISO-8859-13' => 1,
      'ISO-8859-14' => 1,
      'ISO-8859-15' => 1,
      'windows-1250' => 1,
      'windows-1251' => 1,
      'windows-1252' => 1,
      'windows-1253' => 1,
      'windows-1254' => 1,
      'windows-1255' => 1,
      'windows-1256' => 1,
      'windows-1257' => 1,
      'windows-1258' => 1,
      'KOI8-R' => 1,
      'KOI8-U' => 1,
      'cp866' => 1,
      'cp874' => 1,
      'TIS-620' => 1,
      'VISCII' => 1,
      'x-viet-vps' => 1,
      'x-viet-tcvn' => 1
    );

    # Versions of HTML associated with a given FPI
    my %HTMLversion = (
    # ADD  'PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN"' => 'XHTML 1.1 plus MathML 2.0',
    # ADD  'PUBLIC "-//W3C//DTD MathML 2.0//EN"' => 'MathML 2.0',
    # ADD  'PUBLIC "-//W3C//DTD XHTML 1.1//EN"' => 'XHTML 1.1',
      'PUBLIC "-//WAPFORUM//DTD WML 1.3//EN"' => 'WML 1.3',
      'PUBLIC "-//WAPFORUM//DTD WML 1.2//EN"' => 'WML 1.2',
      'PUBLIC "-//WAPFORUM//DTD WML 1.1//EN"' => 'WML 1.1',
      'PUBLIC "-//WAPFORUM//DTD WML 1.0//EN"' => 'WML 1.0',
      'PUBLIC "-//W3C//DTD XHTML Basic 1.0//EN"' => 'XHTML Basic',
      'PUBLIC "ISO/IEC 15445:2000//DTD HyperText Markup Language//EN"' => 'ISO/IEC 15445:2000',
      'PUBLIC "ISO/IEC 15445:2000//DTD HTML//EN"' => 'ISO/IEC 15445:2000',
      'PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"' => 'XHTML 1.0 Strict',
      'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' => 'XHTML 1.0 Transitional',
      'PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"' => 'XHTML 1.0 Frameset',
      'PUBLIC "-//W3C//DTD HTML 4.01//EN"' => 'HTML 4.01 Strict',
      'PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"' => 'HTML 4.01 Transitional',
      'PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"' => 'HTML 4.01 Frameset',
      'PUBLIC "-//W3C//DTD HTML 4.0//EN"' => 'HTML 4.0 Strict',
      'PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"' => 'HTML 4.0 Transitional',
      'PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"' => 'HTML 4.0 Frameset',
      'PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"' => 'HTML 3.2',
      'PUBLIC "-//W3C//DTD HTML 3.2 Draft//EN"' => 'HTML 3.2',
      'PUBLIC "-//W3C//DTD HTML 3.2//EN"' => 'HTML 3.2',
      'PUBLIC "-//W3C//DTD HTML Experimental 970421//EN"' => 'HTML 3.2 + Style',
      'PUBLIC "-//W3O//DTD W3 HTML 3.0//EN"' => 'HTML 3.0 Draft',
      'PUBLIC "-//IETF//DTD HTML 3.0//EN//"' => 'HTML 3.0 Draft',
      'PUBLIC "-//IETF//DTD HTML 3.0//EN"' => 'HTML 3.0 Draft',
      'PUBLIC "-//IETF//DTD HTML i18n//EN"' => 'HTML 2.0 + i18n',
      'PUBLIC "-//IETF//DTD HTML//EN"' => 'HTML 2.0',
      'PUBLIC "-//IETF//DTD HTML 2.0//EN"' => 'HTML 2.0',
      'PUBLIC "-//IETF//DTD HTML Level 2//EN"' => 'HTML 2.0',
      'PUBLIC "-//IETF//DTD HTML 2.0 Level 2//EN"' => 'HTML 2.0',
      'PUBLIC "-//IETF//DTD HTML Level 1//EN"' => 'HTML 2.0 Level 1',
      'PUBLIC "-//IETF//DTD HTML 2.0 Level 1//EN"' => 'HTML 2.0 Level 1',
      'PUBLIC "-//IETF//DTD HTML Strict//EN"' => 'HTML 2.0 Strict',
      'PUBLIC "-//IETF//DTD HTML 2.0 Strict//EN"' => 'HTML 2.0 Strict',
      'PUBLIC "-//IETF//DTD HTML Strict Level 2//EN"' => 'HTML 2.0 Strict',
      'PUBLIC "-//IETF//DTD HTML 2.0 Strict Level 2//EN"' => 'HTML 2.0 Strict',
      'PUBLIC "-//IETF//DTD HTML Strict Level 1//EN"' => 'HTML 2.0 Strict Level 1',
      'PUBLIC "-//IETF//DTD HTML 2.0 Strict Level 1//EN"' => 'HTML 2.0 Strict Level 1'
    );

    # SGML declarations for a given level of HTML
    my %sgmlDecl = (
    # ADD  'XHTML 1.1 plus MathML 2.0' => "$sgmlDir/xhtml11/xml1n.dcl",
    # ADD  'MathML 2.0' => "$sgmlDir/xhtml11/xml1n.dcl",
    # ADD  'XHTML 1.1' => "$sgmlDir/xhtml11/xml1n.dcl",
      'WML 1.3' => "$sgmlDir/xml/1.0/xhtml1.dcl",
      'WML 1.2' => "$sgmlDir/xml/1.0/xhtml1.dcl",
      'WML 1.1' => "$sgmlDir/xml/1.0/xhtml1.dcl",
      'WML 1.0' => "$sgmlDir/xml/1.0/xhtml1.dcl",
      'XHTML Basic' => "$sgmlDir/xml/1.0/xml1.dcl",
      'ISO/IEC 15445:2000' => "$sgmlDir/iso-15445/15445.dcl",
      'XHTML 1.0 Strict' => "$sgmlDir/xml/1.0/xhtml1.dcl",
      'XHTML 1.0 Transitional' => "$sgmlDir/xml/1.0/xhtml1.dcl",
      'XHTML 1.0 Frameset' => "$sgmlDir/xml/1.0/xhtml1.dcl",
      'HTML 4.01 Strict' => "$sgmlDir/4.01/HTML4.decl",
      'HTML 4.01 Transitional' => "$sgmlDir/4.01/HTML4.decl",
      'HTML 4.01 Frameset' => "$sgmlDir/4.01/HTML4.decl",
      'HTML 4.0 Strict' => "$sgmlDir/4.0/HTML4.decl",
      'HTML 4.0 Transitional' => "$sgmlDir/4.0/HTML4.decl",
      'HTML 4.0 Frameset' => "$sgmlDir/4.0/HTML4.decl",
      'HTML 3.2' => "$sgmlDir/html-32.dcl",
      'HTML 3.2 + Style' => "$sgmlDir/html-970421.decl",
      'HTML 3.0 Draft' => "$sgmlDir/HTML-3.decl",
      'HTML 2.0 + i18n' => "$sgmlDir/html-2-i18n.decl",
      'HTML 2.0' => "$sgmlDir/html-2.decl",
      'HTML 2.0 Strict' => "$sgmlDir/html-2.decl",
      'HTML 2.0 Level 1' => "$sgmlDir/html-2.decl",
      'HTML 2.0 Strict Level 1' => "$sgmlDir/html-2.decl",
      'Unknown' => "/usr/share/wdg-html-validator/custom.dcl",
      'Unknown (XML)' => "$sgmlDir/xml/1.0/xhtml1.dcl"
    );

    # XHTML DTDs
    my %xhtml = (
    # ADD  'XHTML 1.1 plus MathML 2.0' => 1,
    # ADD  'MathML 2.0' => 1,
    # ADD  'XHTML 1.1' => 1,
      'WML 1.3' => 1,
      'WML 1.2' => 1,
      'WML 1.1' => 1,
      'WML 1.0' => 1,
      'XHTML Basic' => 1,
      'XHTML 1.0 Strict' => 1,
      'XHTML 1.0 Transitional' => 1,
      'XHTML 1.0 Frameset' => 1,
      'Unknown (XML)' => 1
    );

    # Maximum number of extra characters to include in the HTML extract on
    # either side of the source of the error
    my $extraChars = 30;

    my $urlsChecked = 0;
    my $document = shift;
    my $parser;
    my $charset;
    my $encodingMsg;
    my $xml;
    # Determine character encoding of output page

    unless ($charset) {
        # First check for XML-style charset
        # TODO: This check should probably only be done for XHTML documents
        if ($document =~ m#^\s*<\?xml\s[^>]*encoding\s*=\s*["']?([^"']+)#iso)
        {
            $charset = $1;
        }

        # Check for a META element specifying the character encoding
        elsif ($document =~ m#<META(\s[^>]*http\-equiv\s*=\s*["']?Content\-Type["']?[^>]*)>#iso) {
            my $metaAttributes = $1;
            if ($metaAttributes =~ m#\scontent\s*=\s*["']?.*[\s;]charset\s*=\s*['"]?([^"']+)#iso) {
                $charset = $1;
            }
        }
    }

    # If we don't know the charset and we're checking XHTML, use UTF-8
    # TODO: Support UTF-16
    unless ($charset) {
        # There is some code repetition here with our DOCTYPE
        # checking code later on.  I need to think more carefully
        # about how best to do DOCTYPE and charset checking
        # for XHTML documents.
        if ($document =~ /<!DOCTYPE([^>]*)>/iso) {

            my $doctypeMeat = $1;
            if ($doctypeMeat =~ /PUBLIC\s+("[^"]*")/iso) {
                my $htmlLevel = $HTMLversion{"PUBLIC $1"};
                if (($htmlLevel && $xhtml{$htmlLevel})
                    || (!$htmlLevel && $xml))
                {
                    $charset = 'UTF-8';
                }
            }
        }
    }

    if ($charset) {

        my $enteredCharset = $charset;

        # Get IANA name for character encoding
        my $charsetName = iana_charset_name($charset);

        # Get preferred MIME name
        if ($charsetName) {
            $charset = $MIMECharset{$charsetName};
        } else { # Check for non-IANA charset
            $charsetName = map8_charset_name($charset);
            if ($charsetName) {
                $charset = $MIMECharset{$charsetName};
            }
        }

        if ($charsetName && $charset) {
            $encodingMsg = $charset;

            # This is a quick hack to add basic support for ISO-2022-JP
            if ($charset eq 'ISO-2022-JP') {

                # Write document to temporary file for conversion
                my ($convTempName, $convTempFH) = getTempFile();
                print $convTempFH "$document";
                close($convTempFH);

                # Convert ISO-2022-JP document to Shift_JIS
                open(JCONV, "$jconv -ij -os < $convTempName |")
                  || (print STDERR "Server error converting ISO-2022-JP document", return 0);

                $document = "";
                while (<JCONV>) {
                    $document .= $_;
                }
                close(JCONV);

                unlink("$convTempName");

                $charset = 'Shift_JIS';

            }

        } else {
            $encodingMsg = $enteredCharset
                . " (not supported, assuming ISO-8859-1)";
            $charset = 'ISO-8859-1';
        }

    } else {

        $encodingMsg = "Unknown (assuming ISO-8859-1)";
        $charset = 'ISO-8859-1';

    }

    my @errors; # queue of errors
    my @externalErrors; # queue of errors in an external DTD

    # Amount to decrease line count by (i.e., if we add a DOCTYPE)
    my $lineAdjust = 0;

    my $validatorInput;
    my $noValid = 0; # check for validity by default

    # Adjust line-endings (nsgmls doesn't recognize Mac newlines)
    $document =~ s#\r(?!\n)#\n#go;

    # Determine the level of HTML
    my $htmlLevel = 'Unknown';
    if ($document =~ /<!DOCTYPE([^>]*)>/iso) {

        my $doctypeMeat = $1;
        if ($doctypeMeat =~ /PUBLIC\s+("[^"]*")/iso) {
            $htmlLevel = $HTMLversion{"PUBLIC $1"} || 'Unknown';
        }

        $validatorInput = $document;

        if ($htmlLevel eq 'Unknown' && $xml) {
            $htmlLevel = 'Unknown (XML)';
        }

    } else { # Missing DOCTYPE

        # If the document is XML, just check for well-formedness,
        # not validity.
        if ($xml) {

            $htmlLevel = 'Unknown (XML)';
            $validatorInput = $document;
            $noValid = 1;
            
        } else {

            # Add a default DOCTYPE
            my ($insertedDoctype, $doctypeError);
            if ($document =~ /<FRAMESET/io) {
                $insertedDoctype = $defaultFramesetDoctype;
                $doctypeError = "missing doctype! (assuming \"HTML 4.01 Frameset\")";
            } else {
                $insertedDoctype = $defaultDoctype;
                $doctypeError = "missing doctype! (assuming \"HTML 4.01 Transitional\")";
            }

            # Remove byte-order mark from UTF-8 document so we don't have to
            # bother slipping the added DOCTYPE after it.
            if ($charset eq 'UTF-8') {
                $document =~ s/^\xEF\xBB\xBF//o;
            }

            $validatorInput = "$insertedDoctype" . "\n$document";

            # Calculate line adjustment
            $lineAdjust = 2;

            # Add error message
            push(@errors, "::" . (1 + $lineAdjust) . ":0:E: $doctypeError");

        }
    }

    # Determine whether we're dealing with HTML or XHTML
    if ($xhtml{$htmlLevel}) {
        $ENV{'SGML_CATALOG_FILES'} = $xhtmlCatalog;
    } else {
        $ENV{'SGML_CATALOG_FILES'} = $htmlCatalog;
    }

    # Prepare an array of lines in the document for easy access to a given line
    my @lines = split(/\n/, $document);

    # If necessary, convert to a character encoding (UTF-8) recognized by nsgmls
    my $map;
    if ($conversionNeeded{$charset}) {

        #print STDERR "  WARNING: unable to reencode, skipping validation...\n";
        #return 1;

        $map = Unicode::Map8->new($charset)
            || (print STDERR "Unable to create character encoding map for $charset\n", return 0);

        # Pass through invalid characters
        $map->nostrict;

        # Convert document to UTF-8
        $validatorInput = $map->tou($validatorInput)->utf8;

    }

    # Put the document in a temporary file
    my ($tempname, $tempfh) = getTempFile();
    print $tempfh "$validatorInput";
    close($tempfh);

    my $noValidCmd = '';
    if ($noValid) {
        $noValidCmd = '-wno-valid';
    }

    my $warningsCmd = '';
    if ($xml || $xhtml{$htmlLevel}) {
        $warningsCmd = "$nsgmlsXMLWarnings";
    } else {
        $warningsCmd = "$nsgmlsWarnings";
    }

    # Run the validator
    $ENV{'SP_CHARSET_FIXED'} = 1;
    $ENV{'SP_ENCODING'} = $encodings{$charset};
    open(NSGMLS, "$nsgmls -b$encodings{$charset} $noValidCmd $warningsCmd $sgmlDecl{$htmlLevel} $tempname 2>&1 |")
        || &error("Server error");

    # Create a queue of errors
    while (<NSGMLS>) {
        chomp;

        # Convert character encodings, if necessary
        if (defined($map)) {
            $_ = $map->to8(utf8($_)->ucs2);
        }

        my @error = split(/:/, $_, 6);

        if ($error[4] eq 'E' || $error[4] eq 'X') {

            # With warnings enabled in non-XML validation, some "errors"
            # reported by nsgmls are probably better reported as "warnings"
            # since they are only reported with warnings enabled.
            if (!($xml || $xhtml{$htmlLevel})) {
                if ($errorAsWarning{$error[5]}) {
                    $error[4] = 'W';

                    # lq-nsgmls uses an XML-specific message for one of
                    # these warnings.  Let's try something more helpful
                    # for HTML.  [This still doesn't seem very good;
                    # anyone have suggestions for improvement?]
                    if ($error[5] eq " net-enabling start-tag not supported in {{XML}}") {
                        $error[5] = " net-enabling start-tag; possibly missing required quotes around an attribute value";
                    }

                    $_ = join(':', @error);
                }
            }

            push(@errors, $_);

            # If the DOCTYPE is bad, bail out
            last if ($error[5] eq " unrecognized {{DOCTYPE}}; unable to check document");

        } elsif ($error[4] eq 'W') {

            unless ($error[5] eq " characters in the document character set with numbers exceeding 65535 not supported")
            {
                # Should we separate warnings more explicitly from errors?
                # For now let's lump them together.
                push(@errors, $_);
            }

        } elsif ($error[1] =~ /^<URL>/o) { # error from external DTD

            push(@externalErrors, $_);

        } elsif (length($error[4]) > 1 # Allow secondary messages about preceding error
            && $error[3] ne 'W') # Prevent error about SGML declaration not implied with -wxml
        { 

            push(@errors, $_);

        }

    }
    close(NSGMLS);

    # Delete temporary file
    unlink $tempname;

    # Remove byte-order mark from UTF-8 document in case we output the
    # line containing the byte-order mark
    if ($charset eq 'UTF-8') {
        $lines[0] =~ s/^\xEF\xBB\xBF//o;
    }

    if ($#errors > -1 || $#externalErrors > -1) {
        # Print character encoding information
        print STDERR "  encoding: $encodingMsg\n";

        # Print level of HTML checked
        print STDERR "  doctype: $htmlLevel\n";
        
        if ($noValid) {
            print STDERR "  wellformedness check only\n";
        }
    }

    # Report errors
    if ($#errors > -1 || $#externalErrors > -1) {

        foreach (@externalErrors) {
            my @error = split(/:/, $_, 7);

            # Determine URL containing the error
            my $errorURL;
            if ($error[1] =~ /<URL>(.+)/o) {
                $errorURL = "$1:$error[2]";
            }

            my $lineNumber = $error[3];
            my $character = $error[4] + 1;

            print STDERR "  $lineNumber, $character: ";

            if ($error[6]) {
                print STDERR "  " . $error[6];
            } else {
                print STDERR "  " . $error[5];
            }
            print STDERR "\n";
        }

        foreach (@errors) {
            my @error = split(/:/, $_, 6);

            # I don't think this should happen, but I'm not sure
            next if $#error < 4;

            # Determine line number and character of error
            my $lineNumber = $error[2] - $lineAdjust;
            next unless $lineNumber > 0;
            my $character = $error[3] + 1;

            print STDERR "  $lineNumber, $character: ";

            # Extract relevant section of HTML source.
            my ($line, $preMatch, $maxMatch, $spacesToAdd, $extract, $insertedSpaces, $tabcount, $lineLength, $oneChar);
            $oneChar = ($multibyte{$charset} || '.');

            $line = superChomp($lines[$lineNumber-1]);
            $lineLength = ulength($line, $oneChar);
            $preMatch = max(0, $character - $extraChars);
            $maxMatch = 2 * $extraChars;
            ($extract) = ($line =~ /
                (?:$oneChar)
                {$preMatch}
                ((?:$oneChar)
                {1,$maxMatch})/x);
            $spacesToAdd = $error[3];

            # Expand tabs in the first part of the string to ensure that our character
            # pointer lines up correctly
            ($insertedSpaces, $tabcount) = (0, 0);
            if ($extract =~ /\t/o) {
                my ($firstPart, $secondPart) =
                    ($extract =~ /^(
                    (?:$oneChar)
                    {0,$spacesToAdd})
                    (.*)$/sx);
                ($insertedSpaces, $tabcount, $firstPart) = tabExpand($firstPart);
                $extract = "$firstPart$secondPart";
                $spacesToAdd = $spacesToAdd - $tabcount + $insertedSpaces;
            }

            if (length($extract) > 0) {

                #$extract = "<code class=html>" . $query->escapeHTML($extract) . "</code>";

                # Check if the line was truncated for the extract
                if ($preMatch > 0) {
                    $extract = "... $extract";
                    $spacesToAdd = $extraChars + 3 - $tabcount + $insertedSpaces;
                }
                if ($preMatch + $maxMatch < ulength($line, $oneChar)) {
                    $extract = "$extract ...";
                }

                # Link element names in extract
                $extract = linkElements($extract);

                print STDERR "  $extract\n";
            }

            # Prepare error message, adding emphasis and links where appropriate
            my $errorMsg;
            if ($error[5]) {
                $errorMsg = superChomp($error[5]);
            } else {
                $errorMsg = superChomp($error[4]);
            }
            while ($errorMsg =~ m#\{\{(?:&quot;)?(.+?)(?:&quot;)?\}\}#gos) {
                my $linkText = $1;
                my $lcLinkText = lc($linkText);
                $errorMsg =~ s#\{\{(&quot;)?$linkText(&quot;)?\}\}# $1$linkText$2#;
            }

            # Workaround for the incorrect display of the following error:
            #    value of attribute "NOWRAP" cannot be ""; must be one of
            #    "NOWRAP"
            $errorMsg =~ s#&quot;&quot;#""#go;

            $errorMsg =~ s#&quot;(.+?)&quot;#"$1"#g;

            if ($error[4] eq 'E' || $error[4] eq 'X') { # Error message
                print STDERR "  error: ";
            } elsif ($error[4] eq 'W') { # warning
                print STDERR "  warning: ";
            }
            print STDERR "$errorMsg\n";

        }

    } else { # no errors

        #print STDERR "  (no errors found)\n";
        return 1;

    }
    return 0;

}

# Trim leading and trailing whitespace from a string
# Takes a string as the first argument and returns the new string
sub trim {
    my $str = shift || return;
    $str =~ s/^\s+//go;
    $str =~ s/\s+$//go;
    return $str;
}

# Encode unsafe characters in a file URL
sub encodeFileURL {

  my $url = shift || return;

  $url =~ s/ /\%20/go;
  $url =~ s/"/\%22/go;
  $url =~ s/\#/\%23/go;

  return $url;

}

# Return the maximum of two numbers
sub max {
    if ($_[0] > $_[1]) {
        return $_[0];
    } else {
        return $_[1];
    }
}

# Expand tabs in a string
# Return a list of the number of spaces inserted, the number of
# tabs removed, and the expanded string
# (This is a modified version of Text::Tabs::expand.)
sub tabExpand
{
    my @l = @_;

    my $tabstop = 8;
    my $totalSpacesAdded = 0;
    my $totalTabCount = 0;

    for $_ (@l) {
        my $spacesAdded;
        my $tabCount;
        while (s/(^|\n)([^\t\n]*)(\t+)/
            $1. $2 . (" " x 
                ($spacesAdded = ($tabstop * ($tabCount = length($3))
                - (length($2) % $tabstop))))
            /gsex)
        {
            $totalSpacesAdded += $spacesAdded;
            $totalTabCount += $tabCount;
        }
    }

    return ($totalSpacesAdded, $totalTabCount, @l);
}


# Link element names in HTML code to the appropriate reference page
# The first argument is the input string
# Returns the string with links inserted
sub linkElements {

    my $code = shift || return;

    while ($code =~ m#&lt;([^\s&]+)#go) {
        my $linkText = $1;
        my $lcLinkText = lc($linkText);
    }

    return $code;
}


# Remove any newline characters (\r or \n) at the end of a string
# First argument is the string
# Returns the new string
sub superChomp {

    my $str = shift || return;
    $str =~ s/[\r\n]+$//o;
    return $str;

}


# Return the number of characters in a potentially-multibyte character string
# First argument is the string
# Second argument is a regular expression denoting a single character
# If the string is single-byte, the second argument should be '.' or omitted
sub ulength {

    my $str = shift || return 0;
    my $oneChar = (shift || '.');

    my $length = 0;

    if ($oneChar eq '.') {  # single-byte

        $length = length($str);

    } else {            # multibyte

        while ($str =~ /$oneChar/gos) {
            $length++;
        }

    }

    return $length;
}

# Return true if we like the Content-Type, false otherwise
# First argument must be the Content-Type, minus any parameters
sub checkContentType {
    my $type = shift;

    return ($type eq 'text/html'
        || $type eq 'text/xml'
        || $type eq 'application/xml'
        || $type eq 'application/xhtml+xml'
        || $type eq 'text/sgml'
        || $type eq 'application/sgml'
        || $type eq 'text/vnd.wap.wml'
        || $type eq 'text/x-wap.wml');
}

# Return true if the Content-Type indicates an XML document, false otherwise
# First argument must be the Content-Type, minus any parameters
sub isXMLType {
    my $type = shift;

    # XML media types
    my %xmlMediaTypes = (
        'text/xml' => 1,
        'application/xml' => 1,
        'application/xhtml+xml' => 1,
        'text/vnd.wap.wml' => 1,
        'text/x-wap.wml' => 1
    );


    return ($xmlMediaTypes{$type} || $type =~ /\+xml$/oi);
}

# Create temporary file securely
# Returns the name and file handle of the created file
sub getTempFile {
    my $filename;
    do {
        $filename = POSIX::tmpnam();
    } until sysopen(FH, $filename, O_RDWR|O_CREAT|O_EXCL, 0666);

    return ($filename, \*FH);
}

1;

Platon Group <platon@platon.sk> http://platon.sk/
Copyright © 2002-2006 Platon Group
Stránka používa redakčný systém Metafox
Na začiatok