Помогите с HtmlParser на PHP

Статус
В этой теме нельзя размещать новые ответы.
найти все вхождения strip_tags и подправить

PHP:
<?php
/////////////////////////////////////////////////////////////////////////////////////////////
// HTML parser
////////////////////////////////////////////////////////////////////////////////////////////

// Error reporting is switched off for everything that runs after this line.
error_reporting(0);
class HtmlParser {
  // Parser state.
  // NOTE(review): the constructor also assigns $sc, $slevel and $slevelpos,
  // which are not declared in this list.
  var $pos,               // read offset into $data used by GetWord()
      $tagpos,            // length of $text at the moment GetWord() meets a "<"
      $length,            // strlen($data)
      $data,              // input being parsed
      $stacktag,          // stack of entries holding a "tag" container reference and a "num" counter
      $stacktagpos,       // index of the top entry of $stacktag
      $name,              // $name argument, or the file path when the input is read from a file
      $quotstate,         // -1 outside a quoted value, 1 inside (toggled by multiplying by -1)
      $quottype,          // quote character that opened the current quoted value
      $parname,           // lower-cased name of the attribute being parsed
      $pars,              // attributes collected for the tag being parsed
      $tagname,           // lower-cased name of the tag being parsed
      $content,           // root of the parsed tree; key "contentpos" holds the last child index
      $contentpos,        // not referenced by any method of this class
      $allreadyparsed,    // set to 1 by the constructor when it is given an array
      $pg,                // reference to the grammar array passed to the constructor
      $dc,                // characters GetWord() treats as delimiters
      $nc,                // "<", ">", "=", "/"
      $qc,                // quote characters
      $prevstate,         // state and word recorded by Parse() on each iteration
      $processtag,        // 1 while GetWord() is inside a tag
      $processpar,        // 1 after Parse() has accepted an attribute name
      $processparvalue,   // 1 after Parse() has seen "=" and expects a value
      $c,                 // reference to the container that receives new nodes
      $cp,                // reference to that container's "contentpos"
      $text,              // raw characters accumulated since the last emitted node
      $incomment,         // set to 1 by GetWord() when it reads a "<!--" comment
      $skipto,            // tag name whose close tag ends skipping (grammar key "nohavetags")
      $tagreg,            // per tag name counter maintained by AddNewTag()
      $arr,               // results collected by getarr(): keys 'arr', 'text', 'tag', 'len'
      $wasquot;           // set to 1 by GetWord() when the returned word started with a quote

  /**
   * Initialises the parser state and loads the input.
   *
   * @param mixed  $data     HTML string; or a file path when $datatype is truthy;
   *                         or an array, which is stored as the content tree as is.
   * @param array  $grammar  Grammar table consulted through $this->pg.
   * @param string $name     Name stored in $this->name when $data is an HTML string.
   * @param int    $datatype Falsy: $data is the HTML itself. Truthy: $data is a file to read.
   */
  function __construct($data,$grammar,$name="",$datatype=0) {
    $this->dc=array(" ","\t","\r","\n","<",">","\"","'","=","/");
    $this->nc=array("<",">","=","/");
    $this->qc=array("\"","'");
    $this->sc=array("\r","\n"," ","\t");
    $this->prevstate=array("state"=>0,"word"=>"");
    $this->pg=&$grammar;
    $this->pos=0;
    $this->stacktag=array();
    $this->stacktagpos=-1;
    $this->content=array();
    $this->content["contentpos"]=-1;
    $this->c=&$this->content;
    $this->cp=-1;
    $this->quotstate=-1;
    $this->allreadyparsed=0;
    $this->text="";
    $this->processtag=0;
    $this->processpar=0;
    $this->processparvalue=0;
    $this->slevel=array(0);
    $this->slevelpos=0;
    $this->quottype="";
    $this->skipto="";
    $this->incomment=0;
    $this->tagreg=array();
    $this->arr=array();
    $this->wasquot=0;
    // Defaults so that Parse() sees an empty input when nothing is loaded below.
    $this->data="";
    $this->length=0;

    // The argument has to be tested here: $this->data has not been loaded yet.
    if(is_array($data)) {
      $this->content=&$data;
      $this->allreadyparsed=1;
      return;
    }
    clearstatcache();
    $this->name=$data;
    if (!$datatype) {
      $this->name=$name;
      $this->data=$data;
      $this->length=strlen($this->data);
      return;
    }
    $fp=fopen($this->name,"rb");
    if (!$fp) {
      // SetError() is not defined in this class; use it only when a subclass provides it.
      if (method_exists($this,"SetError"))
        $this->SetError(1,"Can't open file $this->name.",0,0,"Error");
      else
        trigger_error("Can't open file $this->name.",E_USER_WARNING);
      return;
    }
    flock($fp,LOCK_SH);
    $size=filesize($this->name);
    // fread() rejects a zero length, so an empty file is handled separately.
    $this->data=($size>0) ? (string)fread($fp,$size) : "";
    flock($fp,LOCK_UN);
    fclose($fp);
    $this->length=strlen($this->data);
  }

  /**
   * Old-style constructor name, kept so existing explicit calls keep working.
   * Delegates to __construct().
   */
  function HtmlParser($data,$grammar,$name="",$datatype=0) {
    $this->__construct($data,$grammar,$name,$datatype);
  }


  /**
   * Reads the next lexical word from $this->data starting at $this->pos.
   *
   * The word is returned through $word. Characters consumed outside of a
   * comment are also appended to $this->text. A whole "<!-- ... -->" comment
   * is returned as a single word with $this->incomment set to 1.
   *
   * @param string $word Receives the word that was read (reset to "" first).
   * @return bool false once the input is exhausted, true otherwise.
   */
  function GetWord(&$word) {
    $word="";
    $this->wasquot=0;
    if ($this->pos>$this->length) return false;
    while (1) {
      if ($this->pos>$this->length) return false;
      if ($this->pos==$this->length) {
        // Step past the end so that the next call returns false.
        $this->pos++;
        return true;
      }
      if ($this->data[$this->pos]=="<") {
        // The lookahead is guarded so the last character is never overrun.
        if ($this->pos+1<$this->length && $this->data[$this->pos+1]=="!")
          if ($this->length>6 && $this->length-$this->pos+1>6) {
            if (substr($this->data,$this->pos,4)=="<!--") {
              $this->incomment=1;
              while($this->pos<$this->length-3) {
                if (substr($this->data,$this->pos,3)=="-->") {
                  $word.="-->";
                  $this->pos+=3;
                  break;
                } else
                  $word.=$this->data[$this->pos++];
              }
              // Leaves the outer loop; the comment is returned by the final return.
              if ($this->incomment) break;
            }
          }
      }
      if (!$this->processtag) {
        if ($this->data[$this->pos]=="<") {
          $this->processtag=1;
          $this->tagpos=strlen($this->text);
        } else {
          // Plain text outside of a tag is only accumulated in $this->text.
          $this->text.=$this->data[$this->pos++];
          continue;
        }
      }
      if (in_array($this->data[$this->pos],$this->dc)) {
        if (($this->data[$this->pos]=="<" || $this->data[$this->pos]==">") && $this->quotstate==-1 && $this->processparvalue) {
          $this->processparvalue=0;
          return true;
        }
        if (in_array($this->data[$this->pos],$this->sc) && $this->quotstate==-1) {
          $this->text.=$this->data[$this->pos++];
          if (strlen($word)) {
            if ($this->processparvalue) $this->processparvalue=0;
            return true;
          } else
            continue;
        }
        if (!strlen($word)) {
          if (in_array($this->data[$this->pos],$this->qc) && $this->processpar) {
            if ($this->quotstate==-1) {
              // Opening quote of an attribute value.
              $this->wasquot=1;
              $this->quotstate*=-1;
              $this->quottype=$this->data[$this->pos];
              $this->text.=$this->data[$this->pos++];
              continue;
            } elseif ($this->quottype==$this->data[$this->pos]) {
              // Closing quote straight after the opening one: empty value.
              $this->quotstate*=-1;
              $this->quottype=$this->data[$this->pos];
              $this->processpar=$this->processparvalue=0;
              $this->text.=$this->data[$this->pos++];
              return true;
            }
          } elseif (in_array($this->data[$this->pos],$this->nc)) {
            $word.=$this->data[$this->pos];
            $this->text.=$this->data[$this->pos++];
            if ($this->processparvalue)
              continue;
            else
              return true;
          }
        } else {
          if (in_array($this->data[$this->pos],$this->qc) && $this->processpar) {
            if ($this->quotstate==1) {
              if ($this->data[$this->pos]==$this->quottype && $this->processparvalue) {
                $this->quotstate*=-1;
                $this->quottype=$this->data[$this->pos];
                $this->processpar=$this->processparvalue=0;
                $this->text.=$this->data[$this->pos++];
              } else {
                if ($this->data[$this->pos]==$this->quottype) {
                  $this->quotstate*=-1;
                  $this->quottype="";
                }
                $word.=$this->data[$this->pos];
                $this->text.=$this->data[$this->pos++];
                continue;
              }
            }
            return true;
          } else {
            if (in_array($this->data[$this->pos],$this->nc)) {
              if ($this->quotstate==-1) {
                if ($this->processparvalue) {
                  if($this->data[$this->pos]!="/" && $this->data[$this->pos]!="=") return true;
                  $word.=$this->data[$this->pos];
                  $this->text.=$this->data[$this->pos++];
                  continue;
                }
              } else {
                $word.=$this->data[$this->pos];
                $this->text.=$this->data[$this->pos++];
                continue;
              }
              return true;
            } elseif ($this->quotstate==-1 && $this->processparvalue && strlen($word)) {
              if ($this->data[$this->pos]==" ") {
                $this->text.=$this->data[$this->pos++];
                $this->processparvalue=0;
                return true;
              }
            }
          }
        }
      }
      $word.=$this->data[$this->pos];
      $this->text.=$this->data[$this->pos++];
    }
    return true;
  }


  /**
   * Runs the tag state machine over the input and fills $this->content.
   *
   * Each word from GetWord() is classified as "<", "/", "=", ">" or "any word"
   * and drives the transition table below. Negative states are terminal:
   * -1 error (also used to emit comments), -2 open tag finished,
   * -3 close tag finished.
   */
  function Parse() {
    $automat=array(
// states         0   1   2   3   4   5   6   7   8
      "0"=>array( 1, -1, -1, -1, -1, -1, -1, -1, -1),// <
      "1"=>array(-1,  7,  6,  6,  6,  6, -1, -1, -1),// /
      "2"=>array(-1, -1, -1,  4, -1, -1, -1, -1, -1),// =
      "3"=>array(-1, -1, -2, -2, -2, -2, -2, -1, -3),// >
      "4"=>array(-1,  2,  3,  3,  5,  3, -1,  8, -1) // any word
    );
    if (!strlen($this->data)) return;
    $instates=array("<"=>0,"/"=>1,"="=>2,">"=>3);
    $parcount=0;
    $state=0;
    $xmlclose=0;
    $word="";
    $this->c=&$this->content;
    $this->cp=&$this->content["contentpos"];
    $this->stacktag[0]["tag"]=&$this->c;
    $this->stacktag[0]["level"]=&$this->slevel;
    $this->stacktag[0]["levelpos"]=0;
    $this->stacktagpos=0;
    while(1) {
      // GetWord() declares its parameter by reference, so a plain call fills $word.
      if (!$this->GetWord($word)) break;
      $w=strtolower($word);
      if (!isset($instates[$w]))
        $instate=4;
      else
        $instate=$instates[$w];
      $state=$automat[$instate][$state];
      // A quoted word arriving where "/" was expected is treated as an attribute value.
      if ($this->wasquot && $state==6) $state=5;
      switch($state) {
        case -3:// end parse close tag
          if (strlen($this->skipto) && $this->tagname!=$this->skipto) {
            $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
            $this->pars=array();
            break;
          } else
            $this->skipto="";
          $script=($this->tagname=="script") ? 1:0;
          $this->AddNewText(substr($this->text,0,$this->tagpos),$script);
          $this->AddNewTag(0);
          $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
          $this->quottype="";
          $this->quotstate=-1;
          $this->text="";
          $this->pars=array();
          $this->tagpos=0;
          break;
        case -2:// end parse open tag
          if (strlen($this->skipto)) {
            $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
            $this->pars=array();
            break;
          }
          $this->AddNewText(substr($this->text,0,$this->tagpos));
          $this->AddNewTag(1,$xmlclose);
          $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
          $this->quottype="";
          $this->quotstate=-1;
          $this->text="";
          $this->pars=array();
          $this->tagpos=0;
          // Tags flagged "nohavetags" in the grammar: ignore markup until their close tag.
          if (isset($this->pg[$this->tagname]["nohavetags"]) && !strlen($this->skipto)) $this->skipto=$this->tagname;
          break;
        case -1:// Error found
          $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
          $this->pars=array();
          if ($this->incomment) {
            if (strlen($this->text)) {
              $this->AddNewText($this->text);
              $this->text="";
              $this->tagpos=0;
            }
            $this->AddNewText($word,0,1);
            $this->incomment=0;
            break;
          }
          if ($word=="<") {
            // Restart tag parsing from this "<".
            $state=1;
            $this->processtag=1;
            $this->processparvalue=0;
            $this->tagpos=strlen($this->text)-1;
            $this->quottype="";
            $this->quotstate=-1;
          }
          break;
        case 2:// got any word as tagname, waiting '/' or '>' or any word as parameter name
          $this->tagname=$w;
          $xmlclose=0;
          // "D" keeps "$" anchored to the very end of the string, as ereg() did.
          if (!preg_match('/^[a-zA-Z0-9!_-]+$/D',$this->tagname) || strlen($this->skipto)) {
            $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
            $this->quottype="";
            $this->quotstate=-1;
            $this->pars=array();
            break;
          }
          break;
        case 3:// got any word as parameter name, waiting '/' or '>' or '=' or any word as parameter name
          $this->parname=$w;
          if (!preg_match('/^[a-zA-Z0-9!_-]+$/D',$this->parname) || strlen($this->skipto)) {
            $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
            $this->quottype="";
            $this->quotstate=-1;
            $this->pars=array();
            break;
          }
          $this->processpar=1;
          if ($w!="/") {
            $parcount++;
            $this->pars[$this->parname]["single"]=1;
          } else
            $xmlclose=1;
          break;
        case 4:// got '=' waiting '/' or '>' or any word as parameter value
          $this->processparvalue=1;
          break;
        case 5:// got any word as parameter value, waiting '/' or '>' or any word as parameter name
          if ($this->parname!="/") {
            unset($this->pars[$this->parname]["single"]);
            $this->pars[$this->parname]["value"]=$word;
            $this->pars[$this->parname]["quot"]=$this->quottype;
          }
          $this->quottype="";
          $this->processpar=$this->processparvalue=0;
          break;
        case 6:// got '/' waiting '>'
          $xmlclose=1;
          break;
        case 8:// got any word as close tag name, waiting '>'
          $this->tagname=$w;
          break;
      }
      // NOTE(review): the constructor initialises the key "state", this writes "states".
      $this->prevstate["states"]=$state;
      $this->prevstate["word"]=$word;
    }
    if (strlen($this->text)) $this->AddNewText($this->text);
  }

  /**
   * Adds the tag described by $this->tagname / $this->pars to the tree.
   *
   * A close tag that matches an open tag on the stack is stored inside that
   * tag's content and pops the stack. An open tag may first implicitly close
   * the tag on top of the stack, according to the grammar key "closeon".
   *
   * @param int $open     Truthy for an open tag, falsy for a close tag.
   * @param int $xmlclose Truthy when the open tag was written as self-closing.
   */
  function AddNewTag($open,$xmlclose=0) {
    // Grammar lookup done once; a tag missing from the grammar has an end tag.
    $endtagabsent=isset($this->pg[$this->tagname]["endtag"]) && $this->pg[$this->tagname]["endtag"]=="absent";
    $actionclose=0;
    if (!$open && !$endtagabsent) $actionclose=1;

    if ($open)
      // Only the top of the stack is inspected: $i is forced to -1 when it does not close.
      for ($i=$this->stacktagpos;$i>0;$i--) {
        $ct=&$this->stacktag[$i]["tag"];
        $t=&$ct[$ct["contentpos"]];
        $tagname=$t["data"]["name"];
        if (isset($this->pg[$tagname]["closeon"])) {
          $closeon=$this->pg[$tagname]["closeon"];
          if (!empty($closeon["in"]) && in_array($this->tagname,$closeon["in"]) || !empty($closeon["notin"]) && !in_array($this->tagname,$closeon["notin"])) {
            $actionclose=2;
            break;
          }
        }
        if ($actionclose!=2) $i=-1;
      }

    if ($actionclose) {
      if ($actionclose==1) {
        $i=$this->FindTag($this->tagname);
        if ($i>-1)
          if ($this->tagreg[$this->tagname]!=$this->stacktag[$i]["num"])
            $i=-1;
      }
      if ($i>-1) {
        $this->c=&$this->stacktag[$i]["tag"];
        $this->cp=&$this->c["contentpos"];
        $this->stacktagpos=$i;
        if ($actionclose==1) {
          // The close tag node is stored as the last child of the tag it closes.
          $c=&$this->c[$this->c["contentpos"]]["content"];
          $cp=&$this->c[$this->c["contentpos"]]["content"]["contentpos"];
          $cp++;
          $c[$cp]["type"]="tag";
          $c[$cp]["data"]["name"]=$this->tagname;
          $c[$cp]["data"]["type"]="close";
          if (isset($this->tagreg[$this->tagname]))
            if ($this->tagreg[$this->tagname])
              $this->tagreg[$this->tagname]--;
          $this->stacktag[$this->stacktagpos]["num"]=$this->tagreg[$this->tagname];
          $this->stacktagpos--;
        }
        // Drop entries above the new top, walking downwards so the count stays valid.
        // Entry 0 (the root set up by Parse) is never removed.
        $floor=($this->stacktagpos>0) ? $this->stacktagpos : 0;
        for ($j=sizeof($this->stacktag)-1;$j>$floor;$j--)
          unset($this->stacktag[$j]);
        if ($actionclose==1) return;
      }
    }
    $this->cp++;
    $this->c[$this->cp]["type"]="tag";
    $this->c[$this->cp]["data"]["name"]=$this->tagname;
    $this->c[$this->cp]["data"]["type"]=($open) ? "open" : "close";
    if (!$open)
      if (isset($this->tagreg[$this->tagname]))
        if ($this->tagreg[$this->tagname])
          $this->tagreg[$this->tagname]--;
    if ($xmlclose) $this->c[$this->cp]["xmlclose"]=1;
    if (sizeof($this->pars)) $this->c[$this->cp]["pars"]=$this->pars;
    if ($open && !$xmlclose && !$endtagabsent) {
      // Push the tag and make its (empty) content the current container.
      if (!isset($this->tagreg[$this->tagname])) $this->tagreg[$this->tagname]=0;
      $this->tagreg[$this->tagname]++;
      $this->stacktagpos++;
      $this->stacktag[$this->stacktagpos]["tag"]=&$this->c;
      $this->stacktag[$this->stacktagpos]["num"]=$this->tagreg[$this->tagname];
      $this->c[$this->cp]["content"]=array();
      $this->c[$this->cp]["content"]["contentpos"]=-1;
      $this->c=&$this->c[$this->cp]["content"];
      $this->cp=&$this->c["contentpos"];
    }
  }


  /**
   * Appends a text or comment node to the current container and clears $this->text.
   * Empty input is ignored.
   *
   * @param string $text    Node content.
   * @param int    $script  Truthy: apply the frame-related rewrites below to $text first.
   * @param int    $comment Truthy: store the node as "comment" instead of "text".
   */
  function AddNewText($text,$script=0,$comment=0) {
    if (strlen($text)==0) return;

    $this->cp++;
    $slot=$this->cp;
    if ($comment) {
      $this->c[$slot]["type"]="comment";
    } else {
      $this->c[$slot]["type"]="text";
      // Text nodes remember the tag name that was current when they were emitted.
      $this->c[$slot]["ot"]=$this->tagname;
    }

    if ($script) {
      // pattern => replacement, applied in this order
      $rewrites=array(
        "/_top/"=>"_echoserver_file_space",
        "/top.location.href/"=>"parent.frames('_echoserver_file_space').src",
        "/([ \n]+)?window\.name/"=>"//window.name",
        "/parent.location/"=>"parent.frames('_echoserver_file_space').src"
      );
      $text=preg_replace(array_keys($rewrites),array_values($rewrites),$text);
    }

    $this->c[$slot]["data"]=$text;
    $this->text="";
  }


  /**
   * Searches the tag stack from the top down for an entry whose container's
   * last child is a tag named $tagname.
   *
   * Only tag nodes are compared: a text or comment node keeps a string in
   * "data", and indexing that string with "name" must not count as a match.
   *
   * @param string $tagname Lower-cased tag name to look for.
   * @return int Stack index of the match, or -1 when there is none.
   */
  function FindTag($tagname) {
    for($i=$this->stacktagpos;$i>=0;$i--) {
      if (!isset($this->stacktag[$i]["tag"]["contentpos"])) continue;
      $last=$this->stacktag[$i]["tag"]["contentpos"];
      if (!isset($this->stacktag[$i]["tag"][$last]["data"])) continue;
      $data=$this->stacktag[$i]["tag"][$last]["data"];
      if (is_array($data) && isset($data["name"]) && $data["name"]==$tagname)
        return $i;
    }
    return -1;
  }


/**
 * Walks a content tree and records every open <td> / <div> in $this->arr.
 *
 * For each such tag it stores the raw content ('arr'), the text built by
 * getmytext() with every tag except <img> stripped ('text'), the tag name
 * ('tag') and the length of the unstripped text ('len'). Every tag node is
 * then descended into, whether it was recorded or not.
 *
 * @param array $arr Content container: numeric children plus a 'contentpos' key.
 */
function getarr($arr){
    $keeptags="<img>";
    //$keeptags="<p><b><i><em><strong><ul><li><font><span><pre><br>";
    $total=$arr['contentpos']+1;
    $i=0;
    while ($i<$total) {
        // Only tag nodes carry an array in 'data'.
        if (is_array($arr[$i]['data'])) {
            $tag=strtolower($arr[$i]['data']['name']);
            $isblock=in_array($tag,array("td","div"),true);
            if ($isblock and $arr[$i]['data']['type']=="open") {
                $this->arr['arr'][]=$arr[$i]['content'];
                $str=$this->getmytext($arr[$i]['content']);
                $this->arr['text'][]=strip_tags($str,$keeptags);
                $this->arr['tag'][]=$arr[$i]['data']['name'];
                $this->arr['len'][]=strlen($str);
            }
            $this->getarr($arr[$i]['content']);
        }
        $i++;
    }
}
 /**
  * Builds the text of a content container.
  *
  * Nodes whose 'data' is not an array (text and comments) are appended when
  * they contain one of . ! ? , or when they were emitted under a tag accepted
  * by issettt(). Tag nodes accepted by issetm() are descended into.
  *
  * @param array $arr Content container: numeric children plus a 'contentpos' key.
  * @return string Collected text; <img> is the only tag left in node data.
  */
 function getmytext($arr){
     $text='';
     $arr['contentpos']=$arr['contentpos']+1;
     for ($i=0; $i<$arr['contentpos']; $i++) {
         if(($arr[$i]['type']=="text")or(!is_array($arr[$i]['data']))){
             if ((preg_match("/[.!?,]/i",$arr[$i]['data']))and(strlen(trim($arr[$i]['data']))>0)){
                 if(strlen($arr[$i]['ot'])>0){
                     // Wrap the text in the tag name recorded on the node.
                     $text.="<".$arr[$i]['ot']."> ".trim(strip_tags($arr[$i]['data'],"<img>"))."</".$arr[$i]['ot'].">\r\n";
                 }else{
                     $text.=" ".trim(strip_tags($arr[$i]['data'],"<img>"));
                 }
             // strlen() takes a single argument; the emptiness test matches the branch above.
             }elseif (($this->issettt($arr[$i]['ot']))and(strlen(trim($arr[$i]['data']))>0)){
                 $text.=" ".trim(strip_tags($arr[$i]['data'],"<img>"));
             }
         }else{
             if($this->issetm($arr[$i]['data']['name'])){
                 $text.=$this->getmytext($arr[$i]['content']);
             }
         }
     }
     return $text;
 }
/**
 * Tells whether $str names one of the tags b, i, strong, em or a.
 * The comparison is case-insensitive.
 *
 * @param string $str   Tag name.
 * @param string $param Unused.
 * @return bool
 */
function issettt($str,$param="none"){
    $inline=array("b","i","strong","em","a");
    return in_array(strtolower($str),$inline,true);
}
/**
 * Tells whether $str is NOT one of the tags td, tr, table, div, a or script.
 * The comparison is case-insensitive.
 *
 * @param string $str   Tag name.
 * @param string $param Unused.
 * @return bool false for the listed tags, true for anything else.
 */
function issetm($str,$param="none"){
    $excluded=array("td","tr","table","div","a","script");
    return !in_array(strtolower($str),$excluded,true);
}
/**
 * Returns the 'text' entry recorded by getarr() whose 'len' is the largest.
 * On a tie the earliest entry wins; when every length is 0 the first entry
 * is returned.
 *
 * @return string|null The selected text, or null when getarr() recorded nothing.
 */
function maxstr(){
    // Nothing collected: count() must not be called on a missing list.
    if (empty($this->arr['len']) || !is_array($this->arr['len'])) return null;
    $max=0;
    $ch=0;
    foreach ($this->arr['len'] as $i=>$len) {
        if ($max<$len) {
            $max=$len;
            $ch=$i;
        }
    }
    return isset($this->arr['text'][$ch]) ? $this->arr['text'][$ch] : null;
}
}







/**
 * Parses $text with HtmlParser and returns what maxstr() selects after
 * getarr() has walked the parsed tree. The grammar is unserialized from the
 * file "ff1.php".
 *
 * @param string $text HTML source.
 * @return mixed Value returned by HtmlParser::maxstr().
 */
function getcontent($text){
    $grammar=unserialize(file_get_contents("ff1.php"));
    $parser=new HtmlParser($text,$grammar);
    $parser->Parse();
    $parser->getarr($parser->content);
    $longest=$parser->maxstr();
    unset($parser);
    return $longest;
}
?>

Только учти: если пути к картинкам будут относительные (без http:// ...), то картинок в тексте ты не увидишь ))))

тут код нужно будет дописывать
 
рекомендую посмотреть в сторону
офигенный функционал + куча примеров на все случаи жизни
 
рекомендую посмотреть в сторону *** скрытое содержание ***
офигенный функционал + куча примеров на все случаи жизни

А работает ли он, если неизвестно, в каких тегах текст? И, если можно, пример, плиз. Интересует именно парсинг, например, первой попавшейся страницы с неизвестной структурой.
 
Если структура неизвестна, как он будет парсить? ИИ? Тогда уже надо самому включать мозг и думать, как заставить парсер увидеть в коде большой кусок текста. Насколько знаю, один из простых методов — вырезать теги, присущие тексту, и смотреть, в каком куске страницы останется больше всего текста.
 
Статус
В этой теме нельзя размещать новые ответы.
Назад
Сверху