You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

561 lines
25 KiB

<?php
// (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
//
// All Rights Reserved. See copyright.txt for details and a complete list of authors.
// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
// $Id$
if (! defined("_ECHOSERVER_HTML_PARSER")) {
define("_ECHOSERVER_HTML_PARSER", 1);
/**
 * Tolerant, hand-written HTML tokenizer and node-list builder.
 *
 * Usage: construct with the HTML (or a file name), call Parse(), then read
 * the public $content property.
 *
 * $content is a "container": an array whose integer keys hold nodes and whose
 * "contentpos" key holds the index of the last node (-1 when empty). Nodes are:
 *   - ["type" => "text",    "data" => string]
 *   - ["type" => "comment", "data" => string]
 *   - ["type" => "tag", "data" => ["name" => string, "type" => "open"|"close"]]
 *     optionally with "xmlclose" => 1, "pars" => [attrName => [...]] and
 *     "content" => nested container.
 * An attribute entry is either ["single" => 1] (no value) or
 * ["value" => string, "quot" => quote character or ""].
 */
class HtmlParser
{
    /** @var int Read offset into $data. */
    public $pos;
    /** @var int Length of $text at the moment the current tag's "<" was reached. */
    public $tagpos;
    /** @var int strlen($data). */
    public $length;
    /** @var string Raw input being tokenized. */
    public $data;
    /** @var array Stack of containers; each entry holds a "tag" reference and a "num" counter. */
    public $stacktag;
    /** @var int Index of the top entry of $stacktag. */
    public $stacktagpos;
    /** @var string Source name (the file name when the input was read from a file). */
    public $name;
    /** @var int -1 outside a quoted attribute value, 1 inside one. */
    public $quotstate;
    /** @var string Quote character of the attribute value being / last read. */
    public $quottype;
    /** @var string Lower-cased name of the attribute being read. */
    public $parname;
    /** @var array Attributes collected for the tag being read. */
    public $pars;
    /** @var string Lower-cased name of the tag being read. */
    public $tagname;
    /** @var array Root container holding the parse result. */
    public $content;
    /** @var mixed Declared for compatibility; not used by this class. */
    public $contentpos;
    /** @var int 1 when the constructor received an already parsed container. */
    public $allreadyparsed;
    /** @var array Grammar, read as $pg[tagName]["endtag"|"nohavetags"|"closeon"]. */
    public $pg;
    /** @var string[] Characters that end a word. */
    public $dc;
    /** @var string[] Characters returned as one-character words. */
    public $nc;
    /** @var string[] Quote characters. */
    public $qc;
    /** @var string[] Whitespace characters. */
    public $sc;
    /** @var array Last automaton state and word seen by Parse(). */
    public $prevstate;
    /** @var int 1 while the tokenizer is inside "<...>". */
    public $processtag;
    /** @var int 1 after an attribute name has been read. */
    public $processpar;
    /** @var int 1 after "=" has been read and a value is expected. */
    public $processparvalue;
    /** @var array Reference to the container currently receiving nodes. */
    public $c;
    /** @var int Reference to the "contentpos" entry of $c. */
    public $cp;
    /** @var string Raw characters consumed since the last flush. */
    public $text;
    /** @var int 1 when the last word returned by GetWord() is a comment. */
    public $incomment;
    /** @var string Tag name whose close tag ends skip mode; "" when not skipping. */
    public $skipto;
    /** @var array Per tag name counter maintained by AddNewTag(). */
    public $tagreg;
    /** @var int 1 when the last word started with a quote character. */
    public $wasquot;
    /** @var array Assigned in the constructor and referenced from $stacktag[0]["level"]. */
    public $slevel;
    /** @var int Assigned in the constructor; not otherwise used by this class. */
    public $slevelpos;

    /**
     * Prepare a parser for one document.
     *
     * @param string|array $data     HTML text; or, when $datatype is non-zero, the
     *                               name of a file to read; or an already parsed
     *                               container, which is stored as $content as is.
     * @param array        $grammar  Tag grammar (see $pg).
     * @param string       $name     Source name stored in $name for string input.
     * @param int          $datatype 0: $data is the HTML itself; otherwise a file name.
     */
    public function __construct($data, $grammar, $name = "", $datatype = 0)
    {
        $this->dc = [" ", "\t", "\r", "\n", "<", ">", "\"", "'", "=", "/"];
        $this->nc = ["<", ">", "=", "/"];
        $this->qc = ["\"", "'"];
        $this->sc = ["\r", "\n", " ", "\t"];
        $this->prevstate = ["state" => 0, "word" => ""];
        $this->pg =& $grammar;
        $this->pos = 0;
        $this->stacktag = [];
        $this->stacktagpos = -1;
        $this->content = [];
        $this->content["contentpos"] = -1;
        $this->c =& $this->content;
        $this->cp = -1;
        $this->quotstate = -1;
        $this->allreadyparsed = 0;
        $this->text = "";
        $this->processtag = 0;
        $this->processpar = 0;
        $this->processparvalue = 0;
        $this->slevel = [0];
        $this->slevelpos = 0;
        $this->quottype = "";
        $this->skipto = "";
        $this->incomment = 0;
        $this->tagreg = [];
        $this->wasquot = 0;
        $this->pars = [];
        // Always leave $data/$length in a usable state so Parse() is a safe
        // no-op when there is nothing to tokenize.
        $this->data = "";
        $this->length = 0;

        if (is_array($data)) {
            // Plain assignment on purpose: $c aliases $content, so both see the tree.
            $this->content = $data;
            $this->allreadyparsed = 1;
            return;
        }

        clearstatcache();
        if (! $datatype) {
            $this->name = $name;
            $this->data = (string) $data;
            $this->length = strlen($this->data);
            return;
        }

        $this->name = $data;
        $fp = fopen($this->name, "rb");
        if (! $fp) {
            $this->reportError(1, "Can't open file $this->name.", 0, 0, "Error");
            return;
        }
        flock($fp, LOCK_SH);
        // stream_get_contents() copes with empty files, unlike fread($fp, 0).
        $raw = stream_get_contents($fp);
        flock($fp, LOCK_UN);
        fclose($fp);
        $this->data = ($raw === false) ? "" : $raw;
        $this->length = strlen($this->data);
    }

    /**
     * Report a constructor failure.
     *
     * Forwards to SetError() when a subclass provides it (this class does not
     * define one); otherwise raises an E_USER_WARNING with the message.
     */
    private function reportError($code, $message, $line, $column, $severity)
    {
        if (method_exists($this, "SetError")) {
            $this->SetError($code, $message, $line, $column, $severity);
            return;
        }
        trigger_error($message, E_USER_WARNING);
    }

    /**
     * Read the next word from $data.
     *
     * Outside a tag, characters are only appended to $text. Inside a tag the
     * method returns tag punctuation ("<", ">", "=", "/") as one-character
     * words and names / attribute values as longer words. A comment
     * ("<!-- ... -->") is returned whole with $incomment set to 1; an
     * unterminated comment runs to the end of the input.
     *
     * When the end of the input is reached the method returns true once more
     * (with whatever was accumulated) and false on every later call.
     *
     * @param string $word Receives the word (reset to "" on entry).
     * @return bool false once the input is exhausted.
     */
    public function GetWord(&$word)
    {
        $word = "";
        $this->wasquot = 0;
        if ($this->pos > $this->length) {
            return false;
        }
        while (1) {
            if ($this->pos > $this->length) {
                return false;
            }
            if ($this->pos == $this->length) {
                $this->pos++;
                return true;
            }
            // Comment detection. The bounds check keeps a trailing "<" from
            // reading one character past the end of the input.
            if ($this->data[$this->pos] == "<"
                && $this->pos + 1 < $this->length
                && $this->data[$this->pos + 1] == "!"
                && $this->length > 6
                && $this->length - $this->pos + 1 > 6
                && substr($this->data, $this->pos, 4) == "<!--"
            ) {
                $this->incomment = 1;
                $end = strpos($this->data, "-->", $this->pos);
                $end = ($end === false) ? $this->length : $end + 3;
                $word .= substr($this->data, $this->pos, $end - $this->pos);
                $this->pos = $end;
                return true;
            }
            if (! $this->processtag) {
                if ($this->data[$this->pos] == "<") {
                    $this->processtag = 1;
                    $this->tagpos = strlen($this->text);
                } else {
                    $this->text .= $this->data[$this->pos++];
                    continue;
                }
            }
            if (in_array($this->data[$this->pos], $this->dc, true)) {
                // An unquoted attribute value ends at "<" or ">".
                if (($this->data[$this->pos] == "<" || $this->data[$this->pos] == ">")
                    && $this->quotstate == -1
                    && $this->processparvalue
                ) {
                    $this->processparvalue = 0;
                    return true;
                }
                // Whitespace outside quotes ends the current word, if any.
                if (in_array($this->data[$this->pos], $this->sc, true) && $this->quotstate == -1) {
                    $this->text .= $this->data[$this->pos++];
                    if (strlen($word)) {
                        if ($this->processparvalue) {
                            $this->processparvalue = 0;
                        }
                        return true;
                    } else {
                        continue;
                    }
                }
                if (! strlen($word)) {
                    if (in_array($this->data[$this->pos], $this->qc, true) && $this->processpar) {
                        if ($this->quotstate == -1) {
                            // Opening quote of an attribute value.
                            $this->wasquot = 1;
                            $this->quotstate *= -1;
                            $this->quottype = $this->data[$this->pos];
                            $this->text .= $this->data[$this->pos++];
                            continue;
                        } elseif ($this->quottype == $this->data[$this->pos]) {
                            // Closing quote right after the opening one: empty value.
                            $this->quotstate *= -1;
                            $this->quottype = $this->data[$this->pos];
                            $this->processpar = $this->processparvalue = 0;
                            $this->text .= $this->data[$this->pos++];
                            return true;
                        }
                    } elseif (in_array($this->data[$this->pos], $this->nc, true)) {
                        $word .= $this->data[$this->pos];
                        $this->text .= $this->data[$this->pos++];
                        if ($this->processparvalue) {
                            continue;
                        } else {
                            return true;
                        }
                    }
                } else {
                    if (in_array($this->data[$this->pos], $this->qc, true) && $this->processpar) {
                        if ($this->quotstate == 1) {
                            if ($this->data[$this->pos] == $this->quottype && $this->processparvalue) {
                                // Closing quote of the attribute value.
                                $this->quotstate *= -1;
                                $this->quottype = $this->data[$this->pos];
                                $this->processpar = $this->processparvalue = 0;
                                $this->text .= $this->data[$this->pos++];
                            } else {
                                if ($this->data[$this->pos] == $this->quottype) {
                                    $this->quotstate *= -1;
                                    $this->quottype = "";
                                }
                                $word .= $this->data[$this->pos];
                                $this->text .= $this->data[$this->pos++];
                                continue;
                            }
                        }
                        return true;
                    } else {
                        if (in_array($this->data[$this->pos], $this->nc, true)) {
                            if ($this->quotstate == -1) {
                                if ($this->processparvalue) {
                                    // "/" and "=" are allowed inside an unquoted value.
                                    if ($this->data[$this->pos] != "/" && $this->data[$this->pos] != "=") {
                                        return true;
                                    }
                                    $word .= $this->data[$this->pos];
                                    $this->text .= $this->data[$this->pos++];
                                    continue;
                                }
                            } else {
                                $word .= $this->data[$this->pos];
                                $this->text .= $this->data[$this->pos++];
                                continue;
                            }
                            return true;
                        } elseif ($this->quotstate == -1 && $this->processparvalue && strlen($word)) {
                            if ($this->data[$this->pos] == " ") {
                                $this->text .= $this->data[$this->pos++];
                                $this->processparvalue = 0;
                                return true;
                            }
                        }
                    }
                }
            }
            $word .= $this->data[$this->pos];
            $this->text .= $this->data[$this->pos++];
        }
    }

    /**
     * Tokenize $data and fill $content.
     *
     * Recognised forms:
     *   <tagname [parname=|parname=["|']parvalue["|']|parname][/]>
     *   <[/]tagname>
     *
     * Transition table (rows: input, columns: current state):
     *
     *   in/state   0   1   2   3   4   5   6   7   8
     *   <          1  -1  -1  -1  -1  -1  -1  -1  -1
     *   /         -1   7   6   6   6   6  -1  -1  -1
     *   =         -1  -1  -1   4  -1  -1  -1  -1  -1
     *   >         -1  -1  -2  -2  -2  -2  -2  -1  -3
     *   anyword   -1   2   3   3   5   3  -1   8  -1
     *
     *   -3 end of a close tag
     *   -2 end of an open tag
     *   -1 error
     *    0 start
     *    1 got '<', waiting for '/' or a tag name
     *    2 got a tag name, waiting for '/', '>' or an attribute name
     *    3 got an attribute name, waiting for '/', '>', '=' or an attribute name
     *    4 got '=', waiting for '/', '>' or an attribute value
     *    5 got an attribute value, waiting for '/', '>' or an attribute name
     *    6 got '/', waiting for '>'
     *    7 got '/', waiting for a close tag name
     *    8 got a close tag name, waiting for '>'
     *
     * Does nothing when $data is empty (which includes the case where the
     * constructor received an already parsed container).
     */
    public function Parse()
    {
        $automat = [
            // states   0   1   2   3   4   5   6   7   8
            "0" => [ 1, -1, -1, -1, -1, -1, -1, -1, -1],// <
            "1" => [-1,  7,  6,  6,  6,  6, -1, -1, -1],// /
            "2" => [-1, -1, -1,  4, -1, -1, -1, -1, -1],// =
            "3" => [-1, -1, -2, -2, -2, -2, -2, -1, -3],// >
            "4" => [-1,  2,  3,  3,  5,  3, -1,  8, -1] // any word
        ];
        if (! strlen((string) $this->data)) {
            return;
        }
        $instates = ["<" => 0, "/" => 1, "=" => 2, ">" => 3];
        $state = 0;
        $xmlclose = 0;
        $word = "";
        $this->c =& $this->content;
        $this->cp =& $this->content["contentpos"];
        $this->stacktag[0]["tag"] =& $this->c;
        $this->stacktag[0]["level"] =& $this->slevel;
        $this->stacktag[0]["levelpos"] = 0;
        $this->stacktagpos = 0;
        while ($this->GetWord($word)) {
            $w = strtolower($word);
            $instate = isset($instates[$w]) ? $instates[$w] : 4;
            $state = $automat[$instate][$state];
            // A quoted word is a value even when it is literally "/".
            if ($this->wasquot && $state == 6) {
                $state = 5;
            }
            switch ($state) {
                case -3:// end of a close tag
                    if (strlen($this->skipto) && $this->tagname != $this->skipto) {
                        $state = $this->processpar = $this->processparvalue = $this->processtag = 0;
                        $this->pars = [];
                        break;
                    } else {
                        $this->skipto = "";
                    }
                    $script = ($this->tagname == "script") ? 1 : 0;
                    // Flush the text that preceded the tag, then the tag itself.
                    $this->AddNewText(substr($this->text, 0, $this->tagpos), $script);
                    $this->AddNewTag(0);
                    $state = $this->processpar = $this->processparvalue = $this->processtag = 0;
                    $this->quottype = "";
                    $this->quotstate = -1;
                    $this->text = "";
                    $this->pars = [];
                    $this->tagpos = 0;
                    break;
                case -2:// end of an open tag
                    if (strlen($this->skipto)) {
                        $state = $this->processpar = $this->processparvalue = $this->processtag = 0;
                        $this->pars = [];
                        break;
                    }
                    $this->AddNewText(substr($this->text, 0, $this->tagpos));
                    $this->AddNewTag(1, $xmlclose);
                    $state = $this->processpar = $this->processparvalue = $this->processtag = 0;
                    $this->quottype = "";
                    $this->quotstate = -1;
                    $this->text = "";
                    $this->pars = [];
                    $this->tagpos = 0;
                    // Tags flagged "nohavetags" swallow markup until their own close tag.
                    if (isset($this->pg[$this->tagname]["nohavetags"]) && ! strlen($this->skipto)) {
                        $this->skipto = $this->tagname;
                    }
                    break;
                case -1:// error
                    $state = $this->processpar = $this->processparvalue = $this->processtag = 0;
                    $this->pars = [];
                    if ($this->incomment) {
                        if (strlen($this->text)) {
                            $this->AddNewText($this->text);
                            $this->text = "";
                            $this->tagpos = 0;
                        }
                        $this->AddNewText($word, 0, 1);
                        $this->incomment = 0;
                        break;
                    }
                    if ($word == "<") {
                        // Restart tag recognition at this "<".
                        $state = 1;
                        $this->processtag = 1;
                        $this->processparvalue = 0;
                        $this->tagpos = strlen($this->text) - 1;
                        $this->quottype = "";
                        $this->quotstate = -1;
                    }
                    break;
                case 2:// got a tag name
                    $this->tagname = $w;
                    $xmlclose = 0;
                    if (! preg_match("/^[a-zA-Z0-9!_-]+$/", $this->tagname) || strlen($this->skipto)) {
                        $state = $this->processpar = $this->processparvalue = $this->processtag = 0;
                        $this->quottype = "";
                        $this->quotstate = -1;
                        $this->pars = [];
                    }
                    break;
                case 3:// got an attribute name
                    $this->parname = $w;
                    if (! preg_match("/^[a-zA-Z0-9!_-]+$/", $this->parname) || strlen($this->skipto)) {
                        $state = $this->processpar = $this->processparvalue = $this->processtag = 0;
                        $this->quottype = "";
                        $this->quotstate = -1;
                        $this->pars = [];
                        break;
                    }
                    $this->processpar = 1;
                    if ($w != "/") {
                        $this->pars[$this->parname]["single"] = 1;
                    } else {
                        $xmlclose = 1;
                    }
                    break;
                case 4:// got '='
                    $this->processparvalue = 1;
                    break;
                case 5:// got an attribute value
                    if ($this->parname != "/") {
                        unset($this->pars[$this->parname]["single"]);
                        $this->pars[$this->parname]["value"] = $word;
                        $this->pars[$this->parname]["quot"] = $this->quottype;
                    }
                    $this->quottype = "";
                    $this->processpar = $this->processparvalue = 0;
                    break;
                case 6:// got '/', waiting for '>'
                    $xmlclose = 1;
                    break;
                case 8:// got a close tag name
                    $this->tagname = $w;
                    break;
            }
            $this->prevstate["state"] = $state;
            // Historically written under "states"; kept for code that reads that key.
            $this->prevstate["states"] = $state;
            $this->prevstate["word"] = $word;
        }
        if (strlen($this->text)) {
            $this->AddNewText($this->text);
        }
    }

    /**
     * Append the tag described by $tagname / $pars to the tree.
     *
     * @param int $open     Non-zero for an open tag, 0 for a close tag.
     * @param int $xmlclose Non-zero when the open tag ended with "/>".
     */
    public function AddNewTag($open, $xmlclose = 0)
    {
        $actionclose = 0;
        // NOTE(review): in_array() searches the grammar's VALUES, while every other
        // access in this class treats the grammar as keyed by tag name. If the
        // values are arrays this test presumably never matches and the output
        // stays a flat list; isset($this->pg[$this->tagname]) looks like the
        // intent, but switching would change the output to a nested tree, so
        // confirm with the consumers of $content first. Same test further down.
        if (! $open && in_array($this->tagname, $this->pg) && $this->pg[$this->tagname]["endtag"] != "absent") {
            $actionclose = 1;
        }
        if ($open) {
            // Check whether the innermost open tag must be closed by this one.
            for ($i = $this->stacktagpos; $i > 0; $i--) {
                $ct =& $this->stacktag[$i]["tag"];
                $t =& $ct[$ct["contentpos"]];
                $tagname = $t["data"]["name"];
                if (isset($this->pg[$tagname]["closeon"])) {
                    if (isset($this->pg[$tagname]["closeon"]["in"]) && count($this->pg[$tagname]["closeon"]["in"]) && in_array($this->tagname, $this->pg[$tagname]["closeon"]["in"])
                        || isset($this->pg[$tagname]["closeon"]["notin"]) && count($this->pg[$tagname]["closeon"]["notin"]) && ! in_array($this->tagname, $this->pg[$tagname]["closeon"]["notin"])) {
                        $actionclose = 2;
                        break;
                    }
                }
                if ($actionclose != 2) {
                    // Only the top of the stack is examined.
                    $i = -1;
                }
            }
        }
        if ($actionclose) {
            if ($actionclose == 1) {
                $i = $this->FindTag($this->tagname);
                if ($i > -1) {
                    $registered = isset($this->tagreg[$this->tagname]) ? $this->tagreg[$this->tagname] : null;
                    $owner = isset($this->stacktag[$i]["num"]) ? $this->stacktag[$i]["num"] : null;
                    if ($registered != $owner) {
                        $i = -1;
                    }
                }
            }
            if ($i > -1) {
                $this->c =& $this->stacktag[$i]["tag"];
                $this->cp =& $this->c["contentpos"];
                $this->stacktagpos = $i;
                if ($actionclose == 1) {
                    // The close tag becomes the last child of the element it closes.
                    $c =& $this->c[$this->c["contentpos"]]["content"];
                    $cp =& $this->c[$this->c["contentpos"]]["content"]["contentpos"];
                    $cp++;
                    $c[$cp]["type"] = "tag";
                    $c[$cp]["data"]["name"] = $this->tagname;
                    $c[$cp]["data"]["type"] = "close";
                    if (isset($this->tagreg[$this->tagname])) {
                        if ($this->tagreg[$this->tagname]) {
                            $this->tagreg[$this->tagname]--;
                        }
                    }
                    $this->stacktag[$this->stacktagpos]["num"] = isset($this->tagreg[$this->tagname])
                        ? $this->tagreg[$this->tagname]
                        : null;
                    $this->stacktagpos--;
                }
                // Drop every stack entry above the new top. Iterating over a key
                // snapshot avoids stopping early while the array shrinks.
                foreach (array_keys($this->stacktag) as $k) {
                    if ($k > $this->stacktagpos) {
                        unset($this->stacktag[$k]);
                    }
                }
                if ($actionclose == 1) {
                    return;
                }
            }
        }
        $this->cp++;
        $this->c[$this->cp]["type"] = "tag";
        $this->c[$this->cp]["data"]["name"] = $this->tagname;
        $this->c[$this->cp]["data"]["type"] = ($open) ? "open" : "close";
        if (! $open) {
            if (isset($this->tagreg[$this->tagname])) {
                if ($this->tagreg[$this->tagname]) {
                    $this->tagreg[$this->tagname]--;
                }
            }
        }
        if ($xmlclose) {
            $this->c[$this->cp]["xmlclose"] = 1;
        }
        if (count($this->pars)) {
            $this->c[$this->cp]["pars"] = $this->pars;
        }
        // See the NOTE(review) at the top of this method about in_array().
        if ($open && ! $xmlclose && in_array($this->tagname, $this->pg) && $this->pg[$this->tagname]["endtag"] != "absent") {
            if (! isset($this->tagreg[$this->tagname])) {
                $this->tagreg[$this->tagname] = 0;
            }
            $this->tagreg[$this->tagname]++;
            // Push the current container and descend into the new element.
            $this->stacktagpos++;
            $this->stacktag[$this->stacktagpos]["tag"] =& $this->c;
            $this->stacktag[$this->stacktagpos]["num"] = $this->tagreg[$this->tagname];
            $this->c[$this->cp]["content"] = [];
            $this->c[$this->cp]["content"]["contentpos"] = -1;
            $this->c =& $this->c[$this->cp]["content"];
            $this->cp =& $this->c["contentpos"];
        }
    }

    /**
     * Append a text or comment node to the current container and clear $text.
     *
     * Empty text is ignored (and $text is then left untouched).
     *
     * @param string $text    Node text.
     * @param int    $script  Non-zero to rewrite frame-targeting JavaScript in the
     *                        text (Parse() passes 1 for text preceding </script>).
     * @param int    $comment Non-zero to store the node as a comment.
     */
    public function AddNewText($text, $script = 0, $comment = 0)
    {
        if (! strlen($text)) {
            return;
        }
        $this->cp++;
        $this->c[$this->cp]["type"] = $comment ? "comment" : "text";
        if ($script) {
            // Patterns are applied in order, each on the result of the previous one.
            $inputarray = ["/_top/", "/top\.location\.href/", "/([ \n]+)?window\.name/", "/parent\.location/"];
            $replarray = ["_echoserver_file_space", "parent.frames('_echoserver_file_space').src", "//window.name", "parent.frames('_echoserver_file_space').src"];
            $rewritten = preg_replace($inputarray, $replarray, $text);
            if ($rewritten !== null) {
                // preg_replace() returns null on failure; keep the text in that case.
                $text = $rewritten;
            }
        }
        $this->c[$this->cp]["data"] = $text;
        $this->text = "";
    }

    /**
     * Find the innermost stack entry whose container ends with the given tag.
     *
     * Entries whose container is empty or whose last node is not a tag are
     * skipped.
     *
     * @param string $tagname Lower-cased tag name.
     * @return int Stack index, or -1 when no entry matches.
     */
    public function FindTag($tagname)
    {
        for ($i = $this->stacktagpos; $i >= 0; $i--) {
            if (! isset($this->stacktag[$i]["tag"]["contentpos"])) {
                continue;
            }
            $last = $this->stacktag[$i]["tag"]["contentpos"];
            if (isset($this->stacktag[$i]["tag"][$last]["data"]["name"])
                && $this->stacktag[$i]["tag"][$last]["data"]["name"] == $tagname
            ) {
                return $i;
            }
        }
        return -1;
    }
}
} //_ECHOSERVER_HTML_PARSER