Platon Technologies
neprihlásený Prihlásiť Registrácia
SlovakEnglish
open source software development oslavujeme 10 rokov vývoja otvoreného softvéru! Streda, 21. máj 2025

Súbor: [Platon] / phpPlatonLib / HTML_Content / Content.php (stiahnutie)

Revízia 1.26, Mon Dec 12 21:33:31 2016 UTC (8 years, 5 months ago) by nepto


Zmeny od 1.25: +2 -2 [lines]

Ampersand fix regarding PHP/7

<?php

/**
 * HTML_Content - HTML content parser/extractor
 *
 * This class provides a basic runtime environment for so-called
 * "directory applications". It can fetch, extract and parse HTML
 * content. The package offers retrieval of particular elements
 * via its public methods.
 *
 * @author      Ondrej Jombik <nepto@platon.sk>
 * @package     HTML_Content
 * @version     1.0
 * @access      public
 */

/* $Platon: phpPlatonLib/HTML_Content/Content.php,v 1.25 2010-07-17 19:33:54 nepto Exp $ */

/* Notes:
   
   HEAD elements:

   <TITLE>     - required
   </TITLE>    - required
   <BASE>      - required
   </BASE>     - forbidden
   <NOSCRIPT>  - required
   </NOSCRIPT> - required

   Repeatable HEAD elements:

    <SCRIPT>   - required
    </SCRIPT>  - required
    <STYLE>    - required
    </STYLE>   - required
    <META>     - required
    </META>    - forbidden
    <LINK>     - required
    </LINK>    - forbidden
    <OBJECT>   - required
    </OBJECT>  - required

*/

class HTML_Content
{

    /**
     * Parsed document parts.
     *
     * 'doctype', 'head', 'body', 'head_attr' and 'body_attr' are filled in
     * by parse(); 'head_tags' and 'body_tags' are per-tag-name caches that
     * _getTag() fills lazily and that parse()/_translate() invalidate.
     */
    var $_data = array(
            'doctype'   => null,
            'head'      => null,
            'head_attr' => null,
            'head_tags' => null,
            'body'      => null,
            'body_attr' => null,
            'body_tags' => null
            );

    /* Content-Type of the last remote response (defaults to text/html) */
    var $_content_type = null;
    /* Target of the last followed 301 redirect, if any */
    var $_new_url      = null;
    /* Cookies as name => value pairs */
    var $_cookies      = array();
    /* Cookies as returned by HTTP_Request::getResponseCookies() */
    var $_raw_cookies  = array();
    /* Input stream set by setInput() or opened by parse() */
    var $_fp           = null;
    /* Input file name or URL set by setInputFile() */
    var $_file         = null;

    /**
     * Constructor.
     *
     * Delegates to the legacy PHP 4 style constructor method, which PHP 8
     * no longer invokes automatically. It is declared before that method on
     * purpose so that old PHP 5 versions do not complain about a redefined
     * constructor.
     *
     * @param    string      Filename or URL of the input (optional)
     * @param    array       Cookies as name => value pairs (optional)
     * @access   public
     */
    function __construct($file = null, $cookies = null) /* {{{ */
    {
        $this->HTML_Content($file, $cookies);
    } /* }}} */

    /**
     * Legacy (PHP 4 style) constructor, kept for callers and subclasses
     * that invoke it explicitly.
     *
     * @param    string      Filename or URL of the input (optional)
     * @param    array       Cookies as name => value pairs (optional)
     * @return   boolean     result of setInputFile() when a file was given,
     *                       true otherwise
     * @access   public
     */
    function HTML_Content($file = null, $cookies = null) /* {{{ */
    {
        $this->_cookies = is_array($cookies) ? $cookies : array();
        if ($file != null) {
            return $this->setInputFile($file);
        }
        return true;
    } /* }}} */

    /**
     * Defines the file name (or URL) that parse() will read from.
     *
     * @param    string      Filename (full path) or URL
     * @param    array       Cookies as name => value pairs; when it is not
     *                       an array the current cookies are kept
     * @return   boolean     true when $file is a string, false otherwise
     * @see      setInput(), parse()
     * @access   public
     */
    function setInputFile($file, $cookies = null) /* {{{ */
    {
        if (is_array($cookies)) {
            $this->_cookies = $cookies;
        }
        if (! is_string($file)) {
            return false;
        }
        $this->_file = $file;
        return true;
    } /* }}} */

    /**
     * Sets the file handle to use with parse().
     *
     * @param    resource    fopen
     * @return   boolean     true when $fp is a resource, false otherwise
     * @access   public
     * @see      parse(), setInputFile()
     */
    function setInput($fp) /* {{{ */
    {
        if (! is_resource($fp)) {
            return false;
        }
        $this->_fp = $fp;
        return true;
    } /* }}} */

    /**
     * Fetches a remote document using HTTP_Request.
     *
     * The current $_POST data are forwarded with the first request only.
     * Known cookies are sent with every request. 301 redirects are followed
     * (at most 10 of them, so a redirect cycle cannot hang the script) and
     * the last redirect target is remembered for getNewURL(). On success
     * the content type and the response cookies are stored in the object.
     *
     * @param    string      URL to fetch
     * @param    string      receives the response body (by reference)
     * @return   mixed       true on success; false on empty URL, missing
     *                       Location header or too many redirects; otherwise
     *                       whatever HTTP_Request::sendRequest() returned
     * @access   private
     */
    function _getRemoteContent($url, &$data) /* {{{ */
    {
        if (empty($url)) {
            return false;
        }

        include_once 'HTTP/Request.php';

        $max_redirects = 10;

        for ($i = 0; $i <= $max_redirects; $i++) {
            $r = new HTTP_Request($url, array('allowRedirects' => false));
            if ($i == 0) {
                /* POST data belong to the initial request only */
                foreach ($_POST as $key => $val) {
                    $r->addPostData($key, $val);
                }
                if (count($_POST) > 0) {
                    $r->setMethod('POST');
                }
            }
            foreach ($this->_cookies as $key => $val) {
                $r->addCookie($key, $val);
            }
            foreach ($this->_raw_cookies as $cookie) {
                $r->addCookie($cookie['name'], $cookie['value']);
            }
            $ret = $r->sendRequest();
            if ($ret !== true) {
                return $ret;
            }
            if ($r->getResponseCode() == 301) {
                $location = $r->getResponseHeader('Location');
                if (empty($location)) {
                    /* Redirect without a target cannot be followed */
                    return false;
                }
                $url            = $location;
                $this->_new_url = $url;
                continue;
            }

            $data                = $r->getResponseBody();
            $this->_content_type = $r->getResponseHeader('Content-Type');
            $this->_raw_cookies  = $r->getResponseCookies();
            $this->_cookies      = array();
            if (! is_array($this->_raw_cookies)) {
                $this->_raw_cookies = array();
            }
            foreach ($this->_raw_cookies as $cookie) {
                $this->_cookies[$cookie['name']] = $cookie['value'];
            }
            return true;
        }

        /* Redirect limit exceeded */
        return false;
    } /* }}} */

    /**
     * Central parsing function.
     *
     * When $all_data is null the content is read from the handle given to
     * setInput(), or from the file/URL given to setInputFile(); http:// and
     * https:// locations are fetched via _getRemoteContent(). Content that
     * is not text/html is stored unparsed as the body.
     *
     * @param    string      document to parse (optional)
     * @return   boolean     true on success, false when no input could be
     *                       obtained
     * @access   public
     */
    function parse($all_data = null) /* {{{ */
    {
        if ($all_data === null) {
            if (! is_resource($this->_fp)) {
                if (! is_string($this->_file) || $this->_file == '') {
                    return false;
                }
                if (preg_match('~^https?://~i', $this->_file)) {
                    if ($this->_getRemoteContent($this->_file, $all_data) !== true) {
                        return false;
                    }
                } else {
                    $this->_fp = @fopen($this->_file, 'rb');
                    if (! is_resource($this->_fp)) {
                        $this->_fp = null;
                        return false;
                    }
                }
            }
            if ($all_data == '' && is_resource($this->_fp)) {
                $all_data = '';
                /* Strict checks: a chunk consisting of "0" must not end
                   the loop */
                while (! feof($this->_fp)) {
                    $chunk = @fread($this->_fp, 2048);
                    if ($chunk === false || $chunk === '') {
                        break;
                    }
                    $all_data .= $chunk;
                }
                fclose($this->_fp);
                $this->_fp = null;
            }
            if ($all_data === null) {
                $all_data = '';
            }
        }

        $this->_data['doctype']   = '';
        $this->_data['head']      = '';
        $this->_data['body']      = '';
        $this->_data['head_attr'] = '';
        $this->_data['body_attr'] = '';
        /* Tags cached from a previously parsed document are stale now */
        $this->_data['head_tags'] = null;
        $this->_data['body_tags'] = null;
        if (! isset($this->_content_type)) {
            $this->_content_type = 'text/html';
        }

        if (strncasecmp($this->_content_type, 'text/html', 9)) {
            $this->_data['body'] = $all_data;
            return true;
        }

        if (preg_match('~^\s*<!DOCTYPE\s+([^>]*)>~i', $all_data, $matches)) {
            $this->_data['doctype'] = trim($matches[1]);
        }
        /* The lookaheads keep <header>/</header> from being taken for
           <head>/</head>; the non-greedy body stops at the first </head> */
        if (preg_match('~<head(?=[\s>])([^>]*)>(.*?)</head(?=[\s>])[^>]*>~si',
                    $all_data, $matches)) {
            $this->_data['head_attr'] = trim($matches[1]);
            $this->_data['head']      = trim($matches[2]);
        }
        if (preg_match('~<body(?=[\s>])([^>]*)>(.*)$~si', $all_data, $matches)) {
            /* This will try to find closing BODY or HTML tag and if not found,
               content will remain till end of file although document is not
               valid without these ending tags. */
            $matches[2] = preg_replace('|^(.*)</body.*$|si', '$1', $matches[2]);
            $matches[2] = preg_replace('|^(.*)</html.*$|si', '$1', $matches[2]);
            $this->_data['body']      = trim($matches[2]);
            $this->_data['body_attr'] = trim($matches[1]);
        }
        if ($this->_data['head'] == '' && $this->_data['body'] == '') {
            /* No recognizable structure, treat everything as body */
            $this->_data['body'] = $all_data;
        }
        return true;
    } /* }}} */

    /**
     * Applies search => replace pairs to the HEAD content.
     *
     * @param    array       search => replace pairs
     * @return   boolean     false when $ar is not an array
     * @access   public
     */
    function translateHead($ar) /* {{{ */
    {
        return $this->_translate('head', $ar);
    } /* }}} */

    /**
     * Applies search => replace pairs to the BODY content.
     *
     * @param    array       search => replace pairs
     * @return   boolean     false when $ar is not an array
     * @access   public
     */
    function translateBody($ar) /* {{{ */
    {
        return $this->_translate('body', $ar);
    } /* }}} */

    /**
     * Applies search => replace pairs (str_replace) to the given part.
     *
     * @param    string      'head' or 'body'
     * @param    array       search => replace pairs
     * @return   boolean     false when $ar is not an array
     * @access   private
     */
    function _translate($where, $ar) /* {{{ */
    {
        if (! is_array($ar)) {
            return false;
        }
        $text = (string) $this->_data[$where];
        foreach ($ar as $key => $val) {
            $text = str_replace($key, $val, $text);
        }
        $this->_data[$where] = $text;
        /* Tags cached from the untranslated content are stale now */
        $this->_data[$where.'_tags'] = null;
        return true;
    } /* }}} */

    /**
     * Returns parsed tags of the given name, parsing them on first use.
     *
     * @param    string      'head' or 'body'
     * @param    string      tag name
     * @param    int         index of the wanted occurrence; negative means
     *                       all occurrences
     * @return   mixed       list of tags, a single tag (array), or false
     *                       when $idx is out of range
     * @access   private
     */
    function _getTag($where, $name, $idx = -1) /* {{{ */
    {
        $cache = $where.'_tags';
        if (! isset($this->_data[$cache][$name])
                || ! is_array($this->_data[$cache][$name])) {
            $this->_data[$cache][$name] = $this->_parseTag($where, $name);
        }
        $ar = $this->_data[$cache][$name];
        if ($idx < 0) {
            return $ar;
        }
        if ($idx >= count($ar)) {
            return false;
        }
        return $ar[$idx];
    } /* }}} */

    /**
     * Extracts all occurrences of a tag from the HEAD or BODY content.
     *
     * Paired occurrences (<name ...>content</name>) are tried first; when
     * there are none, standalone opening tags are collected instead. Every
     * returned item holds '__attributes' (raw attribute string), optionally
     * '__content', and one entry per attribute keyed by its lowercased name.
     *
     * $where must be head/body, $name must be lowercase
     *
     * @param    string      'head' or 'body'
     * @param    string      tag name
     * @return   array       list of parsed tags (possibly empty)
     * @access   private
     */
    function _parseTag($where, $name) /* {{{ */
    {
        $ret     = array();
        $matches = array();
        $data    = (string) $this->_data[$where];
        $q_name  = preg_quote($name, '~');

        /* The lookaheads stop "b" from matching <body>; the non-greedy
           content keeps repeated elements as separate matches */
        $open   = '<'.$q_name.'(?=[\s/>])([^>]*)>';
        $paired = '~'.$open.'(.*?)</'.$q_name.'(?=[\s>])[^>]*>~si';
        if (! preg_match_all($paired, $data, $matches)) {
            preg_match_all('~'.$open.'~si', $data, $matches);
        }
        if (empty($matches[1])) {
            return $ret;
        }

        $n_tags = count($matches[1]);
        for ($j = 0; $j < $n_tags; $j++) {
            $ret[$j]['__attributes'] = $matches[1][$j];
            if (isset($matches[2][$j])) {
                $ret[$j]['__content'] = $matches[2][$j];
            }
            /* Split result: text, name, value, text, name, value, ... */
            $ar = preg_split('{([\w-]+)=(\'[^\']*\'|"[^"]*"|[^ ]*)}si',
                    $matches[1][$j], -1, PREG_SPLIT_DELIM_CAPTURE);
            $n_parts = count($ar);
            for ($i = 1; $i + 1 < $n_parts; $i += 3) {
                $attr  = $ar[$i];
                $value = $ar[$i + 1];
                $l_val = strlen($value);
                /* Strip one pair of matching surrounding quotes */
                if ($l_val > 1 && (($value[0] == "'" && $value[$l_val - 1] == "'")
                            || ($value[0] == '"' && $value[$l_val - 1] == '"'))) {
                    $value = substr($value, 1, $l_val - 2);
                }

                $ret[$j][strtolower($attr)] = trim($value);
            }
        }

        return $ret;
    } /* }}} */

    /**
     * @return   string      Content-Type of the parsed document
     * @access   public
     */
    function getContentType() /* {{{ */
    {
        return $this->_content_type;
    } /* }}} */

    /**
     * @return   string      target of the last followed 301 redirect or null
     * @access   public
     */
    function getNewURL() /* {{{ */
    {
        return $this->_new_url;
    } /* }}} */

    /**
     * @return   array       cookies as name => value pairs
     * @access   public
     */
    function getCookies() /* {{{ */
    {
        return $this->_cookies;
    } /* }}} */

    /**
     * @return   array       cookies as returned by the last remote response
     * @access   public
     */
    function getRawCookies() /* {{{ */
    {
        return $this->_raw_cookies;
    } /* }}} */

    /**
     * Recomposes a whole HTML document from the parsed parts.
     *
     * Note that the HEAD attributes are not part of the output and that
     * the DOCTYPE line is emitted even when no doctype was found.
     *
     * @return   string      HTML document
     * @access   public
     */
    function getHTML() /* {{{ */
    {
        return '<!DOCTYPE '.$this->_data['doctype'].">\n"
            ."<HTML>\n"."<HEAD>\n".$this->_data['head']."\n</HEAD>\n"
            .'<BODY '.$this->_data['body_attr'].">\n"
            .$this->_data['body']."\n</BODY>\n</HTML>\n";
    } /* }}} */

    /**
     * @return   string      content of the HEAD element
     * @access   public
     */
    function getHead() /* {{{ */
    {
        return $this->_data['head'];
    } /* }}} */

    /**
     * @return   string      content of the BODY element
     * @access   public
     */
    function getBody() /* {{{ */
    {
        return $this->_data['body'];
    } /* }}} */

    /**
     * @return   string      content of the first TITLE element in HEAD,
     *                       empty string when there is none
     * @access   public
     */
    function getTitle() /* {{{ */
    {
        $title_tag = $this->_getTag('head', 'title', 0);
        if (is_array($title_tag) && isset($title_tag['__content'])) {
            return $title_tag['__content'];
        }
        return '';
    } /* }}} */

    /**
     * Looks up tags by name and optionally by attribute.
     *
     * @param    string      'body' for BODY, anything else means HEAD
     * @param    string      tag name
     * @param    string      attribute the tag must have (optional)
     * @param    string      text the attribute value must contain,
     *                       case-insensitively (optional)
     * @return   mixed       without $attr the list of all such tags; with
     *                       $attr the first matching tag or false
     * @access   public
     */
    function getTag($where, $name, $attr = '', $value = '') /* {{{ */
    {
        $attr  = strtolower($attr);
        $where = strtolower($where) == 'body' ? 'body' : 'head';
        $tag   = $this->_getTag($where, strtolower($name));
        if ($attr == '') {
            return $tag;
        }
        if (! is_array($tag)) {
            return false;
        }

        $any_value = ($value == '');
        foreach ($tag as $item) {
            if (! isset($item[$attr])) {
                continue;
            }
            /* Compare with false: a match whose tail is "0" is falsy */
            if ($any_value || stristr($item[$attr], $value) !== false) {
                return $item;
            }
        }
        return false;
    } /* }}} */

    /**
     * Same lookup as getTag(), but the result is composed back to HTML.
     *
     * @param    string      'body' for BODY, anything else means HEAD
     * @param    string      tag name
     * @param    string      attribute the tag must have (optional)
     * @param    string      text the attribute value must contain (optional)
     * @return   mixed       without $attr a list of HTML strings; with $attr
     *                       one HTML string or false when nothing matched
     * @access   public
     */
    function getComposedTag($where, $name, $attr = '', $value = '') /* {{{ */
    {
        $ar = $this->getTag($where, $name, $attr, $value);
        if ($ar === false) {
            return false;
        }
        if ($attr != '') {
            return HTML_Content::composeTag($name, $ar);
        }
        $out_ar = array();
        foreach ($ar as $tag) {
            $out_ar[] = HTML_Content::composeTag($name, $tag);
        }
        return $out_ar;
    } /* }}} */

    /**
     * Composes HTML source of a tag from its parsed form.
     *
     * Declared static because it is called as HTML_Content::composeTag(),
     * which PHP 8 rejects for non-static methods; calling it on an instance
     * keeps working.
     *
     * @param    string      tag name
     * @param    array       tag as produced by _parseTag(); keys starting
     *                       with '__' are not emitted as attributes, and
     *                       '__content' adds the content and a closing tag
     * @return   string      HTML source of the tag
     * @access   public
     */
    static function composeTag($name, $attributes) /* {{{ */
    {
        if (! is_array($attributes)) {
            $attributes = array();
        }
        $ret = htmlspecialchars($name);
        foreach ($attributes as $attr => $value) {
            $attr = (string) $attr;
            if (! strncmp('__', $attr, 2)) {
                continue;
            }

            $value  = (string) $value;
            $has_dq = strpos($value, '"') !== false;
            $has_sq = strpos($value, "'") !== false;

            $ret .= ' '.htmlspecialchars($attr).'=';
            if ($has_dq && ! $has_sq) {
                $ret .= "'".htmlspecialchars($value)."'";
            } else {
                /* htmlspecialchars() escapes the double quotes, so a value
                   containing both kinds of quotes is kept complete */
                $ret .= '"'.htmlspecialchars($value).'"';
            }
        }
        $ret = '<'.$ret.'>';
        if (isset($attributes['__content'])) {
            $ret .= $attributes['__content'];
            $ret .= '</'.htmlspecialchars($name).'>';
        }
        return $ret;
    } /* }}} */

}

/* Modeline for ViM {{{
 * vim: set ts=4:
 * vim600: fdm=marker fdl=0 fdc=0:
 * }}} */

?>

Platon Group <platon@platon.sk> http://platon.sk/
Copyright © 2002-2006 Platon Group
Stránka používa redakčný systém Metafox
Na začiatok