<?php
/**
* HTML_Content - HTML content parser/extractor
*
* This class provides a basic runtime environment for so-called
* "directory applications". It can fetch, extract and parse HTML
* content. The package offers retrieval of particular elements
* via its public methods.
*
* @author Ondrej Jombik <nepto@platon.sk>
* @package HTML_Content
* @version 1.0
* @access public
*/
/* $Platon: phpPlatonLib/HTML_Content/Content.php,v 1.25 2010-07-17 19:33:54 nepto Exp $ */
/* Notes:
HEAD elements:
<TITLE> - required
</TITLE> - required
<BASE> - required
</BASE> - forbidden
<NOSCRIPT> - required
</NOSCRIPT> - required
Repeatable HEAD elements:
<SCRIPT> - required
</SCRIPT> - required
<STYLE> - required
</STYLE> - required
<META> - required
</META> - forbidden
<LINK> - required
</LINK> - forbidden
<OBJECT> - required
</OBJECT> - required
*/
/**
 * Fetches an HTML document (local file, open handle, http:// URL or a
 * string), splits it into DOCTYPE / HEAD / BODY and offers regex-based
 * access to individual tags and their attributes.
 *
 * The tag extraction is regular-expression based, not a real HTML parser:
 * nested elements of the same name are not balanced.
 */
class HTML_Content
{
    /**
     * Parsed document parts. 'head_tags' and 'body_tags' are lazy caches
     * filled by _getTag(), keyed by tag name.
     */
    var $_data = array(
        'doctype'   => null,
        'head'      => null,
        'head_attr' => null,
        'head_tags' => null,
        'body'      => null,
        'body_attr' => null,
        'body_tags' => null
    );
    /** Content-Type of the last remote response; defaults to text/html in parse(). */
    var $_content_type = null;
    /** Target of the last followed 301 redirect, or null. */
    var $_new_url = null;
    /** Cookies as name => value pairs. */
    var $_cookies = array();
    /** Cookies as returned by the HTTP response object of the last request. */
    var $_raw_cookies = array();
    /** Open file handle to read from, or null. */
    var $_fp = null;
    /** File name or URL set through setInputFile(), or null. */
    var $_file = null;

    /**
     * Legacy (PHP 4 style) constructor, kept for backward compatibility.
     *
     * Declared before __construct() on purpose: old PHP 5 releases raise
     * an E_STRICT notice when the class-named method follows __construct().
     *
     * @param string $file    optional file name or URL, see setInputFile()
     * @param array  $cookies optional name => value cookies for remote requests
     * @return boolean result of setInputFile() when $file is given, true otherwise
     * @access public
     */
    function HTML_Content($file = null, $cookies = null) /* {{{ */
    {
        $this->_cookies = is_array($cookies) ? $cookies : array();
        if ($file != null) {
            return $this->setInputFile($file);
        }
        return true;
    } /* }}} */

    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so without this method the arguments passed to
     * "new HTML_Content(...)" would be silently ignored.
     *
     * @param string $file    optional file name or URL, see setInputFile()
     * @param array  $cookies optional name => value cookies for remote requests
     * @access public
     */
    function __construct($file = null, $cookies = null) /* {{{ */
    {
        $this->HTML_Content($file, $cookies);
    } /* }}} */

    /**
     * Remembers the file name or URL that parse() should read.
     *
     * Nothing is opened here; parse() opens the file or fetches the URL.
     *
     * @param string $file    file name (full path) or http:// URL
     * @param array  $cookies optional cookies; replaces the current set when an array
     * @return boolean true if $file is a string, false otherwise
     * @see setInput(), parse()
     * @access public
     */
    function setInputFile($file, $cookies = null) /* {{{ */
    {
        if (is_array($cookies)) {
            $this->_cookies = $cookies;
        }
        if (! is_string($file)) {
            return false;
        }
        $this->_file = $file;
        return true;
    } /* }}} */

    /**
     * Sets the file handle to use with parse().
     *
     * @param resource $fp open handle, e.g. from fopen()
     * @return boolean true if $fp is a resource, false otherwise
     * @access public
     * @see parse(), setInputFile()
     */
    function setInput($fp) /* {{{ */
    {
        if (! is_resource($fp)) {
            return false;
        }
        $this->_fp = $fp;
        return true;
    } /* }}} */

    /**
     * Fetches $url through PEAR HTTP_Request, following 301 redirects.
     *
     * The current $_POST data is forwarded with the first request only.
     * Known cookies are sent with every request; cookies of the final
     * response replace the stored ones.
     *
     * @param string $url   URL to fetch
     * @param string &$data receives the response body on success
     * @return mixed true on success; false on empty URL, missing redirect
     *               target or too many redirects; otherwise whatever
     *               sendRequest() returned when it was not true
     * @access private
     */
    function _getRemoteContent($url, &$data) /* {{{ */
    {
        if (empty($url)) {
            return false;
        }
        include_once 'HTTP/Request.php';
        /* Upper bound so that a redirect cycle cannot loop forever. */
        $max_redirects = 10;
        for ($i = 0; $i <= $max_redirects; $i++) {
            $r = new HTTP_Request($url, array('allowRedirects' => false));
            if ($i == 0) {
                foreach ($_POST as $key => $val) {
                    $r->addPostData($key, $val);
                }
                if (count($_POST) > 0) {
                    $r->setMethod('POST');
                }
            }
            foreach ($this->_cookies as $key => $val) {
                $r->addCookie($key, $val);
            }
            foreach ($this->_raw_cookies as $cookie) {
                $r->addCookie($cookie['name'], $cookie['value']);
            }
            $ret = $r->sendRequest();
            if ($ret !== true) {
                return $ret;
            }
            if ($r->getResponseCode() == 301) {
                $location = $r->getResponseHeader('Location');
                if (empty($location)) {
                    /* A redirect without a target cannot be followed. */
                    return false;
                }
                $url = $location;
                $this->_new_url = $url;
                continue;
            }
            $data = $r->getResponseBody();
            $this->_content_type = $r->getResponseHeader('Content-Type');
            $this->_raw_cookies = $r->getResponseCookies();
            if (! is_array($this->_raw_cookies)) {
                $this->_raw_cookies = array();
            }
            $this->_cookies = array();
            foreach ($this->_raw_cookies as $cookie) {
                $this->_cookies[$cookie['name']] = $cookie['value'];
            }
            return true;
        }
        return false;
    } /* }}} */

    /**
     * Reads everything that is left in an open stream.
     *
     * Chunks are compared against false / '' explicitly; a plain truthiness
     * test would stop at a chunk that happens to be the string "0".
     *
     * @param resource $fp open stream
     * @return string data read (possibly empty)
     * @access private
     */
    function _readStream($fp) /* {{{ */
    {
        $buffer = '';
        while (! feof($fp)) {
            $chunk = @fread($fp, 8192);
            if ($chunk === false || $chunk === '') {
                break;
            }
            $buffer .= $chunk;
        }
        return $buffer;
    } /* }}} */

    /**
     * Returns $haystack truncated before the last case-insensitive
     * occurrence of $needle, or unchanged if $needle does not occur.
     *
     * @param string $haystack text to truncate
     * @param string $needle   marker to cut at
     * @return string
     * @access private
     */
    function _cutBeforeLast($haystack, $needle) /* {{{ */
    {
        $pos = strripos($haystack, $needle);
        if ($pos === false) {
            return $haystack;
        }
        return (string) substr($haystack, 0, $pos);
    } /* }}} */

    /**
     * Central parsing function.
     *
     * When $all_data is null the input comes from the handle given to
     * setInput(), or from the name given to setInputFile(): names starting
     * with "http://" are fetched via _getRemoteContent(), anything else is
     * opened with fopen(). The handle is closed after reading.
     *
     * Content whose type does not start with "text/html" is stored
     * unparsed as the body. If neither HEAD nor BODY is found, the whole
     * input becomes the body.
     *
     * @param string $all_data optional document source to parse directly
     * @return boolean true on success, false if no input could be obtained
     * @see setInputFile(), setInput()
     * @access public
     */
    function parse($all_data = null) /* {{{ */
    {
        if ($all_data === null) {
            if (! is_resource($this->_fp)) {
                if (! is_string($this->_file) || $this->_file === '') {
                    /* No input configured at all. */
                    return false;
                }
                if (! strncmp($this->_file, 'http://', 7)) {
                    if ($this->_getRemoteContent($this->_file, $all_data) !== true) {
                        return false;
                    }
                } else {
                    $this->_fp = @fopen($this->_file, 'rb');
                    if (! is_resource($this->_fp)) {
                        return false;
                    }
                }
            }
            if (is_resource($this->_fp)) {
                $all_data = $this->_readStream($this->_fp);
                fclose($this->_fp);
                $this->_fp = null;
            }
        }

        $this->_data['doctype']   = '';
        $this->_data['head']      = '';
        $this->_data['body']      = '';
        $this->_data['head_attr'] = '';
        $this->_data['body_attr'] = '';
        /* Tags cached from a previously parsed document are no longer valid. */
        $this->_data['head_tags'] = null;
        $this->_data['body_tags'] = null;

        if (! isset($this->_content_type)) {
            $this->_content_type = 'text/html';
        }
        if (strncasecmp((string) $this->_content_type, 'text/html', 9)) {
            $this->_data['body'] = $all_data;
            return true;
        }
        $all_data = (string) $all_data;

        /* Case-insensitive: "<!doctype html>" is the usual HTML5 spelling. */
        if (preg_match('|^\s*<!DOCTYPE\s+([^>]*)>|i', $all_data, $matches)) {
            $this->_data['doctype'] = trim($matches[1]);
        }

        /* \b keeps <header>/</header> from being taken for HEAD; the lazy
           quantifier stops at the first closing tag. */
        if (preg_match('|<head\b([^>]*)>(.*?)</head\b[^>]*>|si', $all_data, $matches)) {
            $this->_data['head_attr'] = trim($matches[1]);
            $this->_data['head']      = trim($matches[2]);
        }

        if (preg_match('{<body\b([^>]*)>}i', $all_data, $matches, PREG_OFFSET_CAPTURE)) {
            $start = $matches[0][1] + strlen($matches[0][0]);
            $body  = (string) substr($all_data, $start);
            /* Cut at the last closing BODY or HTML tag; if neither is found
               the content runs till end of file although such a document
               is not valid. Plain string search is used so that large
               documents cannot hit the PCRE backtrack limit. */
            $body = $this->_cutBeforeLast($body, '</body');
            $body = $this->_cutBeforeLast($body, '</html');
            $this->_data['body']      = trim($body);
            $this->_data['body_attr'] = trim($matches[1][0]);
        }

        if ($this->_data['head'] == '' && $this->_data['body'] == '') {
            $this->_data['body'] = $all_data;
        }
        return true;
    } /* }}} */

    /**
     * Applies search => replace pairs to the HEAD content.
     *
     * @param array $ar search => replacement pairs
     * @return boolean false if $ar is not an array
     * @access public
     */
    function translateHead($ar) /* {{{ */
    {
        return $this->_translate('head', $ar);
    } /* }}} */

    /**
     * Applies search => replace pairs to the BODY content.
     *
     * @param array $ar search => replacement pairs
     * @return boolean false if $ar is not an array
     * @access public
     */
    function translateBody($ar) /* {{{ */
    {
        return $this->_translate('body', $ar);
    } /* }}} */

    /**
     * Replaces every key of $ar by its value in the given document part,
     * one pair after another, and drops the tag cache of that part.
     *
     * @param string $where 'head' or 'body'
     * @param array  $ar    search => replacement pairs
     * @return boolean false if $ar is not an array
     * @access private
     */
    function _translate($where, $ar) /* {{{ */
    {
        if (! is_array($ar)) {
            return false;
        }
        foreach ($ar as $key => $val) {
            $this->_data[$where] = str_replace($key, $val, (string) $this->_data[$where]);
        }
        /* The content changed, so previously extracted tags may be stale. */
        $this->_data[$where.'_tags'] = null;
        return true;
    } /* }}} */

    /**
     * Returns the parsed tags of the given name, parsing them on first use.
     *
     * @param string  $where 'head' or 'body'
     * @param string  $name  lowercase tag name
     * @param integer $idx   index of the tag to return, negative for all
     * @return mixed list of tags, a single tag array, or false when
     *               $idx does not exist
     * @access private
     */
    function _getTag($where, $name, $idx = -1) /* {{{ */
    {
        $cache = $where.'_tags';
        if (! isset($this->_data[$cache][$name])) {
            $this->_data[$cache][$name] = $this->_parseTag($where, $name);
        }
        $tags = $this->_data[$cache][$name];
        if (! is_array($tags)) {
            return false;
        }
        if ($idx < 0) {
            return $tags;
        }
        return isset($tags[$idx]) ? $tags[$idx] : false;
    } /* }}} */

    /**
     * Extracts all occurrences of a tag from the given document part.
     *
     * Paired tags (<x ...>content</x>) are tried first; only when there
     * is none are bare opening tags (<x ...>) collected. Each result has
     * '__attributes' (raw attribute text), '__content' (paired tags only)
     * and one entry per attribute, keyed by the lowercased attribute name.
     * Nested elements of the same name are not balanced.
     *
     * $where must be head/body, $name must be lowercase.
     *
     * @param string $where 'head' or 'body'
     * @param string $name  tag name
     * @return array list of tag arrays, possibly empty
     * @access private
     */
    function _parseTag($where, $name) /* {{{ */
    {
        $ret     = array();
        $matches = array();
        $source  = isset($this->_data[$where]) ? (string) $this->_data[$where] : '';
        $quoted  = preg_quote($name, '|');
        /* The lookahead makes sure the name ends here, so "a" does not
           match <abbr> and "b" does not match <body>. */
        $open  = '<'.$quoted.'(?=[\s/>])([^>]*)>';
        $close = '</'.$quoted.'(?=[\s>])[^>]*>';
        /* Lazy content match: every element gets its own entry instead of
           one match spanning from the first opening to the last closing tag. */
        if (! preg_match_all('|'.$open.'(.*?)'.$close.'|si', $source, $matches)) {
            preg_match_all('|'.$open.'|si', $source, $matches);
        }
        $count = isset($matches[1]) ? count($matches[1]) : 0;
        for ($j = 0; $j < $count; $j++) {
            $tag = array('__attributes' => $matches[1][$j]);
            if (isset($matches[2][$j])) {
                $tag['__content'] = $matches[2][$j];
            }
            foreach ($this->_parseAttributes($matches[1][$j]) as $attr => $value) {
                $tag[$attr] = $value;
            }
            $ret[$j] = $tag;
        }
        return $ret;
    } /* }}} */

    /**
     * Splits raw attribute text into name => value pairs.
     *
     * Values may be single-quoted, double-quoted or unquoted; an unquoted
     * value ends at any whitespace. Surrounding quotes are removed and the
     * value is trimmed. Names are lowercased; attributes without "=" are
     * not reported.
     *
     * @param string $text attribute part of an opening tag
     * @return array
     * @access private
     */
    function _parseAttributes($text) /* {{{ */
    {
        $attrs = array();
        $pairs = array();
        if (! preg_match_all('{([\w-]+)=(\'[^\']*\'|"[^"]*"|[^\s]*)}s',
            (string) $text, $pairs, PREG_SET_ORDER)) {
            return $attrs;
        }
        foreach ($pairs as $pair) {
            $value = $pair[2];
            $len   = strlen($value);
            if ($len > 1
                && ($value[0] == "'" || $value[0] == '"')
                && $value[$len - 1] == $value[0]) {
                $value = substr($value, 1, $len - 2);
            }
            $attrs[strtolower($pair[1])] = trim($value);
        }
        return $attrs;
    } /* }}} */

    /**
     * @return string Content-Type used by the last parse(), or null
     * @access public
     */
    function getContentType() /* {{{ */
    {
        return $this->_content_type;
    } /* }}} */

    /**
     * @return string target of the last followed 301 redirect, or null
     * @access public
     */
    function getNewURL() /* {{{ */
    {
        return $this->_new_url;
    } /* }}} */

    /**
     * @return array cookies as name => value pairs
     * @access public
     */
    function getCookies() /* {{{ */
    {
        return $this->_cookies;
    } /* }}} */

    /**
     * @return array cookies as delivered by the last remote response
     * @access public
     */
    function getRawCookies() /* {{{ */
    {
        return $this->_raw_cookies;
    } /* }}} */

    /**
     * Rebuilds a complete document from the parsed parts.
     *
     * The DOCTYPE line is emitted only when a DOCTYPE was found, and HEAD
     * and BODY carry their attributes only when they have any.
     *
     * @return string
     * @access public
     */
    function getHTML() /* {{{ */
    {
        $doctype   = (string) $this->_data['doctype'];
        $head_attr = (string) $this->_data['head_attr'];
        $body_attr = (string) $this->_data['body_attr'];

        $out = '';
        if ($doctype !== '') {
            $out .= '<!DOCTYPE '.$doctype.">\n";
        }
        $out .= "<HTML>\n"
            .'<HEAD'.($head_attr !== '' ? ' '.$head_attr : '').">\n"
            .$this->_data['head']."\n</HEAD>\n"
            .'<BODY'.($body_attr !== '' ? ' '.$body_attr : '').">\n"
            .$this->_data['body']."\n</BODY>\n</HTML>\n";
        return $out;
    } /* }}} */

    /**
     * @return string content between the HEAD tags
     * @access public
     */
    function getHead() /* {{{ */
    {
        return $this->_data['head'];
    } /* }}} */

    /**
     * @return string content between the BODY tags (or the raw input
     *                when the document could not be split)
     * @access public
     */
    function getBody() /* {{{ */
    {
        return $this->_data['body'];
    } /* }}} */

    /**
     * @return string content of the first TITLE element in HEAD, '' if none
     * @access public
     */
    function getTitle() /* {{{ */
    {
        $title_tag = $this->_getTag('head', 'title', 0);
        if (is_array($title_tag) && isset($title_tag['__content'])) {
            return $title_tag['__content'];
        }
        return '';
    } /* }}} */

    /**
     * Looks up tags by name, optionally filtered by an attribute.
     *
     * Without $attr all tags of that name are returned. With $attr the
     * first tag having that attribute is returned; with $value in
     * addition, the attribute must contain $value (case-insensitive).
     *
     * @param string $where 'body', anything else means 'head'
     * @param string $name  tag name
     * @param string $attr  optional attribute name
     * @param string $value optional substring the attribute must contain
     * @return mixed list of tags, a single tag array, or false if no tag
     *               satisfies the filter
     * @access public
     */
    function getTag($where, $name, $attr = '', $value = '') /* {{{ */
    {
        $attr  = strtolower((string) $attr);
        $value = (string) $value;
        $where = strtolower((string) $where) == 'body' ? 'body' : 'head';
        $tags  = $this->_getTag($where, strtolower((string) $name));
        if ($attr === '') {
            return $tags;
        }
        if (! is_array($tags)) {
            return false;
        }
        foreach ($tags as $tag) {
            if (! isset($tag[$attr])) {
                continue;
            }
            /* Compare against false: the matched remainder itself may be
               a falsy string such as "0". */
            if ($value === '' || stristr($tag[$attr], $value) !== false) {
                return $tag;
            }
        }
        return false;
    } /* }}} */

    /**
     * Same lookup as getTag(), but returns markup instead of arrays.
     *
     * @param string $where 'body', anything else means 'head'
     * @param string $name  tag name
     * @param string $attr  optional attribute name
     * @param string $value optional substring the attribute must contain
     * @return mixed a single tag string when $attr is given, a list of
     *               tag strings otherwise, false if getTag() returned false
     * @see getTag(), composeTag()
     * @access public
     */
    function getComposedTag($where, $name, $attr = '', $value = '') /* {{{ */
    {
        $single = ($attr != '');
        $found  = $this->getTag($where, $name, $attr, $value);
        if ($found === false) {
            return false;
        }
        if ($single) {
            return $this->composeTag($name, $found);
        }
        $out_ar = array();
        foreach ($found as $tag) {
            $out_ar[] = $this->composeTag($name, $tag);
        }
        return $out_ar;
    } /* }}} */

    /**
     * Builds tag markup from a name and an attribute array.
     *
     * Keys starting with "__" are not emitted as attributes. When
     * '__content' is set it is appended verbatim, followed by the closing
     * tag. Attribute values are always double-quoted with both quote
     * styles escaped, so no character of the value is dropped. Entities
     * already present in a value (values from _parseTag() are raw markup)
     * are not encoded a second time.
     *
     * Does not use $this, so it can also be called on any instance
     * without prior parsing.
     *
     * @param string $name       tag name
     * @param array  $attributes attribute name => value pairs
     * @return string
     * @access public
     */
    function composeTag($name, $attributes) /* {{{ */
    {
        if (! is_array($attributes)) {
            $attributes = array();
        }
        /* A single-byte charset is passed because only ASCII characters
           are replaced and the input encoding is unknown; this way bytes
           outside ASCII pass through untouched. */
        $tag_name = htmlspecialchars((string) $name, ENT_QUOTES, 'ISO-8859-1', false);
        $ret = '<'.$tag_name;
        foreach ($attributes as $attr => $value) {
            $attr = (string) $attr;
            if (! strncmp('__', $attr, 2)) {
                continue;
            }
            $ret .= ' '.htmlspecialchars($attr, ENT_QUOTES, 'ISO-8859-1', false)
                .'="'.htmlspecialchars((string) $value, ENT_QUOTES, 'ISO-8859-1', false).'"';
        }
        $ret .= '>';
        if (isset($attributes['__content'])) {
            $ret .= $attributes['__content'];
            $ret .= '</'.$tag_name.'>';
        }
        return $ret;
    } /* }}} */
}
/* Modeline for ViM {{{
* vim: set ts=4:
* vim600: fdm=marker fdl=0 fdc=0:
* }}} */
?>
Platon Group <platon@platon.sk> http://platon.sk/
|