Salut, je viens de trouver un script qui permet de parser du XHTML,
mais les explications sont en anglais.
Serait-il possible de commenter et d'expliquer le script ?
Que faut-il paramétrer ou modifier pour parser la page www.machin.com ?
Code : Tout sélectionner
<?php
/**
* HTML/XML Parser Class
*
* This is a helper class that is used to parse HTML and XML. A unique feature of this parsing class
* is the fact that it includes support for innerHTML (which isn't easy to do).
*
* @author Dennis Pallett
* @copyright Dennis Pallett 2006
* @package HTML_Parser
* @version 1.0
*/
// Helper Class
// To parse HTML/XML
/**
 * Event-driven HTML/XML parser built on PHP's XML extension.
 *
 * parse() feeds a well-formed document to the XML parser and builds a tree of
 * nodes. Each node is an array with the keys:
 *   - "name"      tag name (upper-cased, because case folding is enabled)
 *   - "attr"      attribute map (names upper-cased as well)
 *   - "level"     nesting depth, 1 for the root element
 *   - "tagData"   concatenated non-blank text directly inside the tag (optional)
 *   - "innerhtml" re-serialised markup between the open and close tag (optional)
 *   - "children"  list of child nodes (optional)
 */
class HTML_Parser {
    // XML parser handle created for the current parse() call.
    var $_parser;
    // Most recently opened tag for each nesting level (level => node array).
    var $_tags = array();
    // Markup re-serialised from the parser events (lower-case tags, escaped text).
    var $_html = '';
    // Stack of currently open nodes while parsing; holds the root node afterwards.
    var $output = array();
    // Raw return value of the last xml_parse() call.
    var $strXmlData;
    // Current nesting depth (0 outside the root element).
    var $_level = 0;
    // Copy of $_html in which still-open tags are prefixed with their level
    // number; tagClosed() uses these markers to cut out the innerHTML.
    var $_outline = '';
    // Not used by this class; kept so existing code that reads it keeps working.
    var $_tagcount = array();
    // Error details of the last parse() call; only meaningful when $xml_error is true.
    var $xml_error = false;
    var $xml_error_code;
    var $xml_error_string;
    var $xml_error_line_number;

    /**
     * Returns the markup re-serialised during the last parse() call.
     *
     * @return string
     */
    function get_html () {
        return $this->_html;
    }

    /**
     * Parses a complete document and returns the resulting node tree.
     *
     * @param string $strInputXML Well-formed XML/XHTML source.
     * @return array|false List holding the root node, or false on a parse error
     *                     (details are then available in the $xml_error* properties).
     */
    function parse($strInputXML) {
        // Reset all per-document state so the instance can be reused safely.
        $this->output = array();
        $this->_tags = array();
        $this->_html = '';
        $this->_outline = '';
        $this->_level = 0;
        $this->xml_error = false;
        $this->xml_error_code = null;
        $this->xml_error_string = null;
        $this->xml_error_line_number = null;

        // Named HTML entities (&eacute; ...) are unknown to a plain XML parser,
        // so they are rewritten as numeric character references first.
        $strInputXML = $this->translate_entities((string) $strInputXML);

        $this->_parser = xml_parser_create();
        xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, true);
        // Array callables work on every PHP version and avoid xml_set_object().
        xml_set_element_handler($this->_parser, array($this, 'tagOpen'), array($this, 'tagClosed'));
        xml_set_character_data_handler($this->_parser, array($this, 'tagData'));

        // The whole document is passed in one call, so mark it as the final
        // chunk; otherwise a truncated document may not be reported as an error.
        $this->strXmlData = xml_parse($this->_parser, $strInputXML, true);
        if (!$this->strXmlData) {
            $code = xml_get_error_code($this->_parser);
            $this->xml_error = true;
            $this->xml_error_code = $code;
            $this->xml_error_string = xml_error_string($code);
            $this->xml_error_line_number = xml_get_current_line_number($this->_parser);
            return false;
        }
        return $this->output;
    }

    /**
     * Start-element handler: pushes a new node and records its opening tag.
     *
     * @param mixed  $parser Parser handle passed in by the XML extension.
     * @param string $name   Tag name.
     * @param array  $attr   Attribute map.
     */
    function tagOpen($parser, $name, $attr) {
        $this->_level++;

        $newtag = $this->create_tag($name, $attr);
        $tag = array("name"=>$name, "attr"=>$attr, "level"=>$this->_level);

        // Push onto the stack of open nodes and remember it for this level.
        array_push($this->output, $tag);
        $this->_tags[$this->_level] = $tag;

        $this->_html .= $newtag;
        // The level prefix marks the tag as "still open" inside the outline.
        $this->_outline .= $this->_level . $newtag;
    }

    /**
     * Serialises an opening tag with lower-case names and escaped attribute values.
     * "br" and "input" are written in self-closing form.
     *
     * @param string $name Tag name.
     * @param array  $attr Attribute map.
     * @return string
     */
    function create_tag ($name, $attr) {
        $lname = strtolower($name);
        $tag = '<' . $lname . ' ';
        foreach ($attr as $key=>$val) {
            $tag .= strtolower($key) . '="' . htmlentities($val) . '" ';
        }
        // Drop the trailing space left by the name or the last attribute.
        $tag = trim($tag);
        switch ($lname) {
            case 'br':
            case 'input':
                $tag .= ' /';
                break;
        }
        $tag .= '>';
        return $tag;
    }

    /**
     * Character-data handler: appends text to the innermost open node.
     *
     * @param mixed  $parser  Parser handle passed in by the XML extension.
     * @param string $tagData Text chunk.
     */
    function tagData($parser, $tagData) {
        $last = count($this->output) - 1;
        // Compare against '' explicitly: a bare truthiness test would drop "0".
        if ($last >= 0 && trim($tagData) !== '') {
            if (isset($this->output[$last]['tagData'])) {
                $this->output[$last]['tagData'] .= $tagData;
            } else {
                $this->output[$last]['tagData'] = $tagData;
            }
        }
        $escaped = htmlentities($tagData);
        $this->_html .= $escaped;
        $this->_outline .= $escaped;
    }

    /**
     * End-element handler: records innerHTML and attaches the node to its parent.
     *
     * @param mixed  $parser Parser handle passed in by the XML extension.
     * @param string $name   Tag name.
     */
    function tagClosed($parser, $name) {
        $closing = '</' . strtolower($name) . '>';

        // "br" and "input" were emitted self-closing, so they get no end tag.
        switch (strtolower($name)) {
            case 'br':
            case 'input':
                break;
            default:
                $this->_outline .= $this->_level . $closing;
                $this->_html .= $closing;
        }

        // Rebuild the opening tag that belongs to this end tag.
        $tag = $this->_tags[$this->_level];
        $tag = $this->create_tag($tag['name'], $tag['attr']);

        // innerHTML is whatever sits between the level-marked open and close tags.
        $regex = '%' . preg_quote($this->_level . $tag, '%') . '(.*?)' . preg_quote($this->_level . $closing, '%') . '%is';
        if (preg_match($regex, $this->_outline, $matches) && isset($matches[1])) {
            $innerhtml = $matches[1];
        }

        // Remove this level's markers so enclosing tags see clean markup.
        $this->_outline = str_replace($this->_level . $tag, $tag, $this->_outline);
        $this->_outline = str_replace($this->_level . $closing, $closing, $this->_outline);

        $last = count($this->output) - 1;
        if ($last >= 0) {
            if (isset($innerhtml)) {
                $this->output[$last]['innerhtml'] = $innerhtml;
            }
            // Move the finished node under its parent. The root node has no
            // parent and simply stays on the stack as the final result.
            if ($last > 0) {
                $this->output[$last - 1]['children'][] = $this->output[$last];
                array_pop($this->output);
            }
        }

        $this->_level--;
    }

    /**
     * Converts named HTML entities to numeric character references, or back.
     * The entities for & " < > are left alone because XML already defines them.
     *
     * @param string $xmlSource Text to convert.
     * @param bool   $reverse   When true, numeric references become named entities.
     * @return string
     */
    function translate_entities($xmlSource, $reverse =FALSE) {
        // Built once and shared by all calls.
        static $literal2NumericEntity;
        if (empty($literal2NumericEntity)) {
            $literal2NumericEntity = array();
            $transTbl = get_html_translation_table(HTML_ENTITIES);
            foreach ($transTbl as $char => $entity) {
                $char = (string) $char;
                if (strpos('&"<>', $char) !== FALSE) continue;
                // The table keys may be multibyte UTF-8 strings, so ord() alone
                // would only see the first byte and yield the wrong number.
                $codepoint = $this->_codepoint($char);
                if ($codepoint === null) continue;
                $literal2NumericEntity[$entity] = '&#' . $codepoint . ';';
            }
        }
        if ($reverse) {
            return strtr($xmlSource, array_flip($literal2NumericEntity));
        } else {
            return strtr($xmlSource, $literal2NumericEntity);
        }
    }

    /**
     * Returns the code point of a single character given either as one byte
     * (single-byte table) or as one UTF-8 sequence; null if it is neither.
     *
     * @param string $char
     * @return int|null
     */
    function _codepoint($char) {
        $len = strlen($char);
        if ($len === 0) {
            return null;
        }
        $lead = ord($char[0]);
        if ($len === 1) {
            return $lead;
        }
        // Derive the expected sequence length from the UTF-8 lead byte.
        if (($lead & 0xE0) === 0xC0) {
            $need = 2;
            $codepoint = $lead & 0x1F;
        } elseif (($lead & 0xF0) === 0xE0) {
            $need = 3;
            $codepoint = $lead & 0x0F;
        } elseif (($lead & 0xF8) === 0xF0) {
            $need = 4;
            $codepoint = $lead & 0x07;
        } else {
            return null;
        }
        if ($len !== $need) {
            return null;
        }
        for ($i = 1; $i < $need; $i++) {
            $byte = ord($char[$i]);
            // Every continuation byte must look like 10xxxxxx.
            if (($byte & 0xC0) !== 0x80) {
                return null;
            }
            $codepoint = ($codepoint << 6) | ($byte & 0x3F);
        }
        return $codepoint;
    }
}
// Example usage.
// $html must hold the XHTML/XML source to parse and is expected to be set
// before this point (for instance by the script that includes this file).
// A small sample document is used when it is missing, so the example still
// runs instead of parsing an undefined variable.
if (!isset($html)) {
    $html = '<p>Hello <b>world</b></p>';
}
$parser = new HTML_Parser;
$output = $parser->parse($html);
if ($output === false) {
    // parse() returns false on malformed input; show what went wrong
    // instead of printing nothing.
    echo 'XML error ' . $parser->xml_error_code . ' (' . $parser->xml_error_string . ') on line ' . $parser->xml_error_line_number . "\n";
} else {
    print_r ($output);
}
?>