Помогите с HtmlParser на PHP

Тема в разделе "Как сделать...", создана пользователем Ewsen, 8 апр 2010.

Статус темы:
Закрыта.
  1. Ewsen

    Ewsen

    Регистр.:
    26 июл 2008
    Сообщения:
    163
    Симпатии:
    59
    Есть HtmlParser на PHP (см. ссылку), код которого я видел во многих скриптах-парсерах: Vipbablo, WebGrabber, TextMaker, FeedMaster и т. п.
    Вот этот код (файл page.function.php из пакета Vipbablo):
    PHP:
    <?php
    /////////////////////////////////////////////////////////////////////////////////////////////
    // HTML parser
    ////////////////////////////////////////////////////////////////////////////////////////////
    /******************************************************/
    // Turns off all PHP error reporting for the rest of the script.
    error_reporting(0);
    /**
     * Table-driven HTML tokenizer and tree builder, with helpers that pick the
     * longest text block found inside <td>/<div> elements.
     *
     * Parse() fills $content with nodes. Every container array keeps the index of
     * its last node under the key "contentpos" (-1 when empty). A node is one of:
     *   - array("type"=>"text",    "ot"=>tag name current when the text was stored, "data"=>string)
     *   - array("type"=>"comment", "data"=>string)
     *   - array("type"=>"tag",     "data"=>array("name"=>..., "type"=>"open"|"close"),
     *           optional "pars", "xmlclose", and "content" (a nested container))
     *
     * Grammar keys read by this class, per lower-cased tag name:
     *   "endtag"     - the value "absent" marks a tag that never gets a content container
     *   "nohavetags" - when set, everything up to the matching close tag is kept as raw text
     *   "closeon"    - array with "in" / "notin" lists that implicitly close the tag
     */
    class HtmlParser {
        var $pos,               // offset of the next unread byte in $data
            $tagpos,            // length of $text at the moment the current tag started
            $length,            // strlen($data)
            $data,              // raw HTML being parsed
            $stacktag,          // stack of open tags: "tag" => container reference, "num" => nesting counter
            $stacktagpos,       // index of the top of $stacktag
            $name,
            $quotstate,         // -1 outside a quoted attribute value, 1 inside
            $quottype,          // quote character that opened the current value
            $parname,
            $pars,              // attributes collected for the tag being parsed
            $tagname,
            $content,           // root container of the parsed tree
            $contentpos,
            $allreadyparsed,
            $pg,                // grammar
            $dc,                // delimiter characters
            $nc,                // single-character tokens: < > = /
            $qc,                // quote characters
            $sc,                // whitespace characters
            $prevstate,
            $processtag,        // 1 while scanning inside a tag
            $processpar,        // 1 after an attribute name was read
            $processparvalue,   // 1 after '=' while the value is being read
            $c,                 // reference to the container currently being filled
            $cp,                // reference to that container's "contentpos"
            $text,              // raw source accumulated since the last stored node
            $incomment,
            $skipto,            // tag name whose close tag ends raw-text mode
            $tagreg,            // per-tag-name counter of currently open tags
            $arr,               // candidate blocks collected by getarr()
            $wasquot,           // 1 when the last word returned by GetWord() was quoted
            $slevel,
            $slevelpos;

        /**
         * Initialises the parser.
         *
         * @param mixed  $data     HTML string; a file name when $datatype is non-zero;
         *                         or an already parsed content array
         * @param array  $grammar  tag grammar (see class comment)
         * @param string $name     label stored in $this->name when $data is an HTML string
         * @param int    $datatype non-zero: treat $data as a path and read the file
         */
        function __construct($data,$grammar,$name="",$datatype=0) {
            $this->dc=array(" ","\t","\r","\n","<",">","\"","'","=","/");
            $this->nc=array("<",">","=","/");
            $this->qc=array("\"","'");
            $this->sc=array("\r","\n"," ","\t");
            $this->prevstate=array("state"=>0,"word"=>"");
            $this->pg=$grammar;
            $this->pos=0;
            $this->tagpos=0;
            $this->data="";
            $this->length=0;
            $this->stacktag=array();
            $this->stacktagpos=-1;
            $this->content=array();
            $this->content["contentpos"]=-1;
            $this->c=&$this->content;
            $this->cp=-1;
            $this->quotstate=-1;
            $this->allreadyparsed=0;
            $this->text="";
            $this->tagname="";
            // Must be an array before the first tag is stored: AddNewTag() counts it.
            $this->pars=array();
            $this->processtag=0;
            $this->processpar=0;
            $this->processparvalue=0;
            $this->slevel=array(0);
            $this->slevelpos=0;
            $this->quottype="";
            $this->skipto="";
            $this->incomment=0;
            $this->tagreg=array();
            $this->arr=array();
            $this->wasquot=0;

            // An array argument is taken as an already parsed tree.
            if (is_array($data)) {
                $this->content=$data;
                $this->allreadyparsed=1;
                return;
            }
            clearstatcache();
            $this->name=$data;
            if (!$datatype) {
                $this->name=$name;
                $this->data=(string)$data;
                $this->length=strlen($this->data);
                return;
            }
            if (!$fp=fopen($this->name,"rb")) {
                // SetError() is not defined in this class; call it only when a subclass provides it.
                if (method_exists($this,"SetError"))
                    $this->SetError(1,"Can't open file $this->name.",0,0,"Error");
                else
                    trigger_error("Can't open file $this->name.",E_USER_WARNING);
                return;
            }
            flock($fp,LOCK_SH);
            $size=filesize($this->name);
            // fread() rejects a zero length, so an empty file is handled separately.
            $this->data=($size>0) ? (string)fread($fp,$size) : "";
            flock($fp,LOCK_UN);
            fclose($fp);
            $this->length=strlen($this->data);
        }

        /**
         * PHP 4 style constructor, kept so that engines and callers that still use
         * the class-name method reach the same initialisation.
         */
        function HtmlParser($data,$grammar,$name="",$datatype=0) {
            $this->__construct($data,$grammar,$name,$datatype);
        }

        /**
         * Reads the next token from $data into $word.
         *
         * Outside a tag, characters are appended to $this->text until '<' is met.
         * Inside a tag the token is one of "<", ">", "=", "/", a bare word, or a
         * quoted attribute value (quotes stripped, $this->wasquot set to 1).
         * A complete "<!-- ... -->" comment is returned as a single word with
         * $this->incomment set to 1.
         *
         * @param string $word receives the token (may be empty at end of input)
         * @return bool false once the input is exhausted, true otherwise
         */
        function GetWord(&$word) {
            $word="";
            $this->wasquot=0;
            while (1) {
                if ($this->pos>$this->length) return false;
                if ($this->pos==$this->length) {
                    // One final call reports end of input with whatever was collected.
                    $this->pos++;
                    return true;
                }
                $ch=$this->data[$this->pos];

                // Comment: swallow everything up to and including "-->".
                if ($ch=="<" && $this->pos+1<$this->length && $this->data[$this->pos+1]=="!"
                    && $this->length>6 && $this->length-$this->pos+1>6
                    && substr($this->data,$this->pos,4)=="<!--") {
                    $this->incomment=1;
                    // "<=" so that a terminator sitting at the very end of the input is seen.
                    while ($this->pos<=$this->length-3) {
                        if (substr($this->data,$this->pos,3)=="-->") {
                            $word.="-->";
                            $this->pos+=3;
                            break;
                        }
                        $word.=$this->data[$this->pos++];
                    }
                    break;
                }

                if (!$this->processtag) {
                    if ($ch=="<") {
                        $this->processtag=1;
                        $this->tagpos=strlen($this->text);
                    } else {
                        $this->text.=$ch;
                        $this->pos++;
                        continue;
                    }
                }

                if (in_array($ch,$this->dc)) {
                    // An unquoted attribute value ends at '<' or '>' (not consumed).
                    if (($ch=="<" || $ch==">") && $this->quotstate==-1 && $this->processparvalue) {
                        $this->processparvalue=0;
                        return true;
                    }
                    // Whitespace outside quotes separates words.
                    if (in_array($ch,$this->sc) && $this->quotstate==-1) {
                        $this->text.=$ch;
                        $this->pos++;
                        if (strlen($word)) {
                            if ($this->processparvalue) $this->processparvalue=0;
                            return true;
                        }
                        continue;
                    }
                    if (!strlen($word)) {
                        if (in_array($ch,$this->qc) && $this->processpar) {
                            if ($this->quotstate==-1) {
                                // Opening quote of an attribute value.
                                $this->wasquot=1;
                                $this->quotstate*=-1;
                                $this->quottype=$ch;
                                $this->text.=$ch;
                                $this->pos++;
                                continue;
                            } elseif ($this->quottype==$ch) {
                                // Closing quote right after the opening one: empty value.
                                $this->quotstate*=-1;
                                $this->quottype=$ch;
                                $this->processpar=$this->processparvalue=0;
                                $this->text.=$ch;
                                $this->pos++;
                                return true;
                            }
                        } elseif (in_array($ch,$this->nc)) {
                            $word.=$ch;
                            $this->text.=$ch;
                            $this->pos++;
                            if ($this->processparvalue)
                                continue;
                            return true;
                        }
                    } else {
                        if (in_array($ch,$this->qc) && $this->processpar) {
                            if ($this->quotstate==1) {
                                if ($ch==$this->quottype && $this->processparvalue) {
                                    // Closing quote of the value being read.
                                    $this->quotstate*=-1;
                                    $this->quottype=$ch;
                                    $this->processpar=$this->processparvalue=0;
                                    $this->text.=$ch;
                                    $this->pos++;
                                } else {
                                    if ($ch==$this->quottype) {
                                        $this->quotstate*=-1;
                                        $this->quottype="";
                                    }
                                    $word.=$ch;
                                    $this->text.=$ch;
                                    $this->pos++;
                                    continue;
                                }
                            }
                            return true;
                        } else {
                            if (in_array($ch,$this->nc)) {
                                if ($this->quotstate==-1) {
                                    if ($this->processparvalue) {
                                        // '/' and '=' stay part of an unquoted value.
                                        if ($ch!="/" && $ch!="=") return true;
                                        $word.=$ch;
                                        $this->text.=$ch;
                                        $this->pos++;
                                        continue;
                                    }
                                } else {
                                    $word.=$ch;
                                    $this->text.=$ch;
                                    $this->pos++;
                                    continue;
                                }
                                return true;
                            } elseif ($this->quotstate==-1 && $this->processparvalue && strlen($word)) {
                                if ($ch==" ") {
                                    $this->text.=$ch;
                                    $this->pos++;
                                    $this->processparvalue=0;
                                    return true;
                                }
                            }
                        }
                    }
                }
                $word.=$ch;
                $this->text.=$ch;
                $this->pos++;
            }
            return true;
        }

        /**
         * Clears the per-tag scanner flags and the collected attributes.
         *
         * @param int $quotes non-zero also resets the quote tracking
         */
        function _resetTag($quotes) {
            $this->processpar=$this->processparvalue=$this->processtag=0;
            $this->pars=array();
            if ($quotes) {
                $this->quottype="";
                $this->quotstate=-1;
            }
        }

        /**
         * Parses $this->data into the tree rooted at $this->content.
         *
         * Grammar accepted for a tag:
         *   <tagname [parname=|parname=["|']parvalue["|']|parname][/]> | <[/]tagname>
         *
         * Transition table (rows: input token, columns: current state):
         *
         *   in/state  0  1  2  3  4  5  6  7  8
         *   <         1 -1 -1 -1 -1 -1 -1 -1 -1
         *   /        -1  7  6  6  6  6 -1 -1 -1
         *   =        -1 -1 -1  4 -1 -1 -1 -1 -1
         *   >        -1 -1 -2 -2 -2 -2 -2 -1 -3
         *   anyword  -1  2  3  3  5  3 -1  8 -1
         *
         *   -3 end parse close tag
         *   -2 end parse open tag
         *   -1 error
         *    0 begin parse
         *    1 got '<', waiting '/' or any word as tag name
         *    2 got any word as tagname, waiting '/' or '>' or any word as parameter name
         *    3 got any word as parameter name, waiting '/' or '>' or '=' or any word as parameter name
         *    4 got '=' waiting '/' or '>' or any word as parameter value
         *    5 got any word as parameter value, waiting '/' or '>' or any word as parameter name
         *    6 got '/' waiting '>'
         *    7 got '/', waiting any word as close tagname
         *    8 got any word as close tag name, waiting '>'
         */
        function Parse() {
            $automat=array(
            // states      0   1   2   3   4   5   6   7   8
                0=>array(  1, -1, -1, -1, -1, -1, -1, -1, -1), // <
                1=>array( -1,  7,  6,  6,  6,  6, -1, -1, -1), // /
                2=>array( -1, -1, -1,  4, -1, -1, -1, -1, -1), // =
                3=>array( -1, -1, -2, -2, -2, -2, -2, -1, -3), // >
                4=>array( -1,  2,  3,  3,  5,  3, -1,  8, -1)  // any word
            );
            if (!strlen($this->data)) return;
            $instates=array("<"=>0,"/"=>1,"="=>2,">"=>3);
            $state=0;
            $xmlclose=0;
            $word="";
            $this->c=&$this->content;
            $this->cp=&$this->content["contentpos"];
            $this->stacktag[0]["tag"]=&$this->c;
            $this->stacktag[0]["level"]=&$this->slevel;
            $this->stacktag[0]["levelpos"]=0;
            $this->stacktagpos=0;
            while ($this->GetWord($word)) {
                $w=strtolower($word);
                $instate=isset($instates[$w]) ? $instates[$w] : 4;
                $state=$automat[$instate][$state];
                // A quoted "/" is an attribute value, not the self-closing slash.
                if ($this->wasquot && $state==6) $state=5;
                switch ($state) {
                    case -3: // end parse close tag
                        if (strlen($this->skipto) && $this->tagname!=$this->skipto) {
                            // Raw-text mode: ignore every close tag except the awaited one.
                            $state=0;
                            $this->_resetTag(0);
                            break;
                        }
                        $this->skipto="";
                        $script=($this->tagname=="script") ? 1 : 0;
                        $this->AddNewText(substr($this->text,0,$this->tagpos),$script);
                        $this->AddNewTag(0);
                        $state=0;
                        $this->_resetTag(1);
                        $this->text="";
                        $this->tagpos=0;
                        break;
                    case -2: // end parse open tag
                        if (strlen($this->skipto)) {
                            $state=0;
                            $this->_resetTag(0);
                            break;
                        }
                        $this->AddNewText(substr($this->text,0,$this->tagpos));
                        $this->AddNewTag(1,$xmlclose);
                        $state=0;
                        $this->_resetTag(1);
                        $this->text="";
                        $this->tagpos=0;
                        if (isset($this->pg[$this->tagname]["nohavetags"]) && !strlen($this->skipto))
                            $this->skipto=$this->tagname;
                        break;
                    case -1: // error found
                        $state=0;
                        $this->_resetTag(0);
                        if ($this->incomment) {
                            // GetWord() returned a whole comment: store pending text, then the comment.
                            if (strlen($this->text)) {
                                $this->AddNewText($this->text);
                                $this->text="";
                                $this->tagpos=0;
                            }
                            $this->AddNewText($word,0,1);
                            $this->incomment=0;
                            break;
                        }
                        if ($word=="<") {
                            // A new '<' inside a broken tag restarts tag parsing from here.
                            $state=1;
                            $this->processtag=1;
                            $this->processparvalue=0;
                            $this->tagpos=strlen($this->text)-1;
                            $this->quottype="";
                            $this->quotstate=-1;
                        }
                        break;
                    case 2: // got any word as tagname
                        $this->tagname=$w;
                        $xmlclose=0;
                        if (!preg_match('/^[a-zA-Z0-9!_-]+$/D',$this->tagname) || strlen($this->skipto)) {
                            $state=0;
                            $this->_resetTag(1);
                        }
                        break;
                    case 3: // got any word as parameter name
                        $this->parname=$w;
                        if (!preg_match('/^[a-zA-Z0-9!_-]+$/D',$this->parname) || strlen($this->skipto)) {
                            $state=0;
                            $this->_resetTag(1);
                            break;
                        }
                        $this->processpar=1;
                        if ($w!="/")
                            $this->pars[$this->parname]["single"]=1;
                        else
                            $xmlclose=1;
                        break;
                    case 4: // got '='
                        $this->processparvalue=1;
                        break;
                    case 5: // got any word as parameter value
                        if ($this->parname!="/") {
                            unset($this->pars[$this->parname]["single"]);
                            $this->pars[$this->parname]["value"]=$word;
                            $this->pars[$this->parname]["quot"]=$this->quottype;
                        }
                        $this->quottype="";
                        $this->processpar=$this->processparvalue=0;
                        break;
                    case 6: // got '/' waiting '>'
                        $xmlclose=1;
                        break;
                    case 8: // got any word as close tag name
                        $this->tagname=$w;
                        break;
                }
                $this->prevstate["states"]=$state;
                $this->prevstate["word"]=$word;
            }
            if (strlen($this->text)) $this->AddNewText($this->text);
        }

        /**
         * Tells whether the grammar lets $tagname have a closing tag.
         *
         * @param string $tagname lower-cased tag name
         * @return bool false only when the grammar entry "endtag" equals "absent"
         */
        function _hasEndTag($tagname) {
            return !(isset($this->pg[$tagname]["endtag"]) && $this->pg[$tagname]["endtag"]=="absent");
        }

        /**
         * Stores the tag named $this->tagname in the tree.
         *
         * @param int $open     1 for an opening tag, 0 for a closing tag
         * @param int $xmlclose 1 when the opening tag ended with "/>"
         */
        function AddNewTag($open,$xmlclose=0) {
            $actionclose=0;
            $i=-1;
            if (!$open && $this->_hasEndTag($this->tagname)) $actionclose=1;

            // An opening tag may implicitly close the tag on top of the stack ("closeon" rule).
            if ($open && $this->stacktagpos>0) {
                $ct=&$this->stacktag[$this->stacktagpos]["tag"];
                $last=isset($ct["contentpos"]) ? $ct["contentpos"] : -1;
                $toptag="";
                if (isset($ct[$last]["data"]) && is_array($ct[$last]["data"]) && isset($ct[$last]["data"]["name"]))
                    $toptag=$ct[$last]["data"]["name"];
                if (isset($this->pg[$toptag]["closeon"])) {
                    $rule=$this->pg[$toptag]["closeon"];
                    $in=isset($rule["in"]) && is_array($rule["in"])
                        && in_array($this->tagname,$rule["in"]);
                    $notin=isset($rule["notin"]) && is_array($rule["notin"]) && sizeof($rule["notin"])
                        && !in_array($this->tagname,$rule["notin"]);
                    if ($in || $notin) {
                        $actionclose=2;
                        $i=$this->stacktagpos;
                    }
                }
            }

            if ($actionclose) {
                if ($actionclose==1) {
                    $i=$this->FindTag($this->tagname);
                    if ($i>-1) {
                        // Accept the match only when the nesting counters agree.
                        $reg=isset($this->tagreg[$this->tagname]) ? $this->tagreg[$this->tagname] : null;
                        $num=isset($this->stacktag[$i]["num"]) ? $this->stacktag[$i]["num"] : null;
                        if ($reg!=$num) $i=-1;
                    }
                }
                if ($i>-1) {
                    // Step back to the container that holds the matched open tag.
                    $this->c=&$this->stacktag[$i]["tag"];
                    $this->cp=&$this->c["contentpos"];
                    $this->stacktagpos=$i;
                    if ($actionclose==1) {
                        // The close tag is appended as the last node of the element's own content.
                        $c=&$this->c[$this->c["contentpos"]]["content"];
                        $cp=&$c["contentpos"];
                        $cp++;
                        $c[$cp]["type"]="tag";
                        $c[$cp]["data"]["name"]=$this->tagname;
                        $c[$cp]["data"]["type"]="close";
                        if (!empty($this->tagreg[$this->tagname]))
                            $this->tagreg[$this->tagname]--;
                        $this->stacktag[$this->stacktagpos]["num"]=
                            isset($this->tagreg[$this->tagname]) ? $this->tagreg[$this->tagname] : null;
                        $this->stacktagpos--;
                    }
                    // Drop stack entries above the new top.
                    foreach (array_keys($this->stacktag) as $k)
                        if ($k>$this->stacktagpos) unset($this->stacktag[$k]);
                    if ($actionclose==1) return;
                }
            }

            $this->cp++;
            $this->c[$this->cp]["type"]="tag";
            $this->c[$this->cp]["data"]["name"]=$this->tagname;
            $this->c[$this->cp]["data"]["type"]=($open) ? "open" : "close";
            if (!$open && !empty($this->tagreg[$this->tagname]))
                $this->tagreg[$this->tagname]--;
            if ($xmlclose) $this->c[$this->cp]["xmlclose"]=1;
            if (is_array($this->pars) && sizeof($this->pars)) $this->c[$this->cp]["pars"]=$this->pars;
            if ($open && !$xmlclose && $this->_hasEndTag($this->tagname)) {
                // Push the tag and descend into its (new, empty) content container.
                if (!isset($this->tagreg[$this->tagname])) $this->tagreg[$this->tagname]=0;
                $this->tagreg[$this->tagname]++;
                $this->stacktagpos++;
                $this->stacktag[$this->stacktagpos]["tag"]=&$this->c;
                $this->stacktag[$this->stacktagpos]["num"]=$this->tagreg[$this->tagname];
                $this->c[$this->cp]["content"]=array();
                $this->c[$this->cp]["content"]["contentpos"]=-1;
                $this->c=&$this->c[$this->cp]["content"];
                $this->cp=&$this->c["contentpos"];
            }
        }

        /**
         * Appends a text or comment node to the current container and clears $this->text.
         * Empty strings are ignored.
         *
         * @param string $text    node text
         * @param int    $script  non-zero: rewrite frame-related identifiers in the text
         *                        (patterns listed below)
         * @param int    $comment non-zero: store the node with type "comment"
         */
        function AddNewText($text,$script=0,$comment=0) {
            if (!strlen($text)) return;
            $this->cp++;
            if (!$comment) {
                $this->c[$this->cp]["type"]="text";
                $this->c[$this->cp]["ot"]=$this->tagname;
            } else
                $this->c[$this->cp]["type"]="comment";
            if ($script) {
                $inputarray=array("/_top/","/top.location.href/","/([ \n]+)?window\.name/","/parent.location/");
                $replarray=array("_echoserver_file_space","parent.frames('_echoserver_file_space').src","//window.name","parent.frames('_echoserver_file_space').src");
                $text=preg_replace($inputarray,$replarray,$text);
            }
            $this->c[$this->cp]["data"]=$text;
            $this->text="";
        }

        /**
         * Searches the open-tag stack from the top for an entry whose container
         * currently ends with a tag node named $tagname.
         *
         * @param string $tagname lower-cased tag name
         * @return int stack index, or -1 when nothing matches
         */
        function FindTag($tagname) {
            for ($i=$this->stacktagpos;$i>=0;$i--) {
                if (!isset($this->stacktag[$i]["tag"])) continue;
                $ct=&$this->stacktag[$i]["tag"];
                $last=isset($ct["contentpos"]) ? $ct["contentpos"] : -1;
                // Text and comment nodes keep a string in "data"; only tag nodes have a name.
                if (isset($ct[$last]["data"]) && is_array($ct[$last]["data"])
                    && isset($ct[$last]["data"]["name"]) && $ct[$last]["data"]["name"]==$tagname)
                    return $i;
            }
            return -1;
        }

        /**
         * Walks a parsed container recursively and records every opening <td>/<div>
         * in $this->arr: "arr" (its content), "text" (tag-stripped text),
         * "tag" (tag name) and "len" (length of the text before stripping).
         *
         * @param array $arr container produced by Parse()
         */
        function getarr($arr){
            if (!is_array($arr)) return;
            $count=(isset($arr['contentpos']) ? $arr['contentpos'] : 0)+1;
            for ($i=0;$i<$count;$i++) {
                if (!isset($arr[$i]['data']) || !is_array($arr[$i]['data'])) continue;
                $tag=$arr[$i]['data'];
                $content=isset($arr[$i]['content']) ? $arr[$i]['content'] : null;
                $name=isset($tag['name']) ? strtolower($tag['name']) : "";
                if (($name=="td" || $name=="div") && isset($tag['type']) && $tag['type']=="open") {
                    $this->arr['arr'][]=$content;
                    $str=$this->getmytext($content);
                    $this->arr['text'][]=strip_tags($str);
                    $this->arr['tag'][]=$tag['name'];
                    $this->arr['len'][]=strlen($str);
                }
                $this->getarr($content);
            }
        }

        /**
         * Collects the text of a container.
         *
         * Text containing one of . ! ? , is wrapped in its "ot" tag name when it has
         * one; text without punctuation is kept only when "ot" is an inline tag
         * (see issettt()). Child tags are descended into only when issetm() allows it.
         *
         * @param array $arr container produced by Parse()
         * @return string collected text
         */
         function getmytext($arr){
            $text='';
            if (!is_array($arr)) return $text;
            $count=(isset($arr['contentpos']) ? $arr['contentpos'] : 0)+1;
            for ($i=0;$i<$count;$i++) {
                $type=isset($arr[$i]['type']) ? $arr[$i]['type'] : '';
                $data=isset($arr[$i]['data']) ? $arr[$i]['data'] : '';
                if ($type=="text" || !is_array($data)) {
                    $data=(string)$data;
                    $ot=isset($arr[$i]['ot']) ? (string)$arr[$i]['ot'] : '';
                    if (strlen(trim($data))==0) continue;
                    if (preg_match("/[.!?,]/i",$data)) {
                        if (strlen($ot)>0)
                            $text.="<".$ot."> ".trim(strip_tags($data))."</".$ot.">\r\n";
                        else
                            $text.=" ".trim(strip_tags($data));
                    } elseif ($this->issettt($ot)) {
                        $text.=" ".trim(strip_tags($data));
                    }
                } else {
                    $name=isset($data['name']) ? $data['name'] : '';
                    if ($this->issetm($name))
                        $text.=$this->getmytext(isset($arr[$i]['content']) ? $arr[$i]['content'] : null);
                }
            }
            return $text;
        }

        /**
         * @param string $str   tag name
         * @param string $param unused
         * @return bool true for the inline tags b, i, strong, em, a
         */
        function issettt($str,$param="none"){
            return in_array(strtolower((string)$str),array("b","i","strong","em","a"),true);
        }

        /**
         * @param string $str   tag name
         * @param string $param unused
         * @return bool false for td, tr, table, div, a, script; true for anything else
         */
        function issetm($str,$param="none"){
            return !in_array(strtolower((string)$str),array("td","tr","table","div","a","script"),true);
        }

        /**
         * Returns the stripped text of the longest block recorded by getarr().
         *
         * @return string|null null when getarr() recorded nothing
         */
        function maxstr(){
            if (empty($this->arr['len'])) return null;
            $max=0;
            $ch=0;
            $total=count($this->arr['len']);
            for ($i=0;$i<$total;$i++) {
                if ($max<$this->arr['len'][$i]) {
                    $max=$this->arr['len'][$i];
                    $ch=$i;
                }
            }
            return isset($this->arr['text'][$ch]) ? $this->arr['text'][$ch] : null;
        }
    }


    /*************************************************************/

    /*    $url=$_GET['url'];
        $file=file_get_contents($url);
        echo "<hr>";
        echo getcontent($file);    */


    /**
     * Returns the largest text block of an HTML page.
     *
     * Loads the serialized grammar from the file "ff1.php", parses $text with
     * HtmlParser, collects the td/div blocks and returns the result of maxstr().
     * NOTE(review): unserialize() must only ever be fed a trusted grammar file.
     *
     * @param string $text HTML source
     * @return mixed whatever HtmlParser::maxstr() returns
     */
    function getcontent($text){
        $grammar=unserialize(file_get_contents("ff1.php"));
        $parser=new HtmlParser($text,$grammar);
        $parser->Parse();
        $parser->getarr($parser->content);
        $longest=$parser->maxstr();
        unset($parser);
        return $longest;
    }
    ?>
    Под VipBablo и др. вышеперечисленные скрипты этот код дописали так, чтобы парсер находил на странице самую большую часть контента. Таким образом автоматически парсится, с неплохим успехом, именно контент страницы и отбрасывается всякий мусор.

    Подскажите пожалуйста, как дописать этот код, чтобы контент парсился вместе с картинками, которые встречаются в выбранной части контента?
     
  2. blizz123

    blizz123 Читатель

    Заблокирован
    Регистр.:
    25 апр 2010
    Сообщения:
    99
    Симпатии:
    17
    как вариант - занулить скрипт от випбабло и посмотреть как у них :)
     
  3. ZCFD

    ZCFD

    Регистр.:
    16 янв 2008
    Сообщения:
    989
    Симпатии:
    437
    найти все вхождения strip_tags и подправить

    PHP:
    <?php
    /////////////////////////////////////////////////////////////////////////////////////////////
    // HTML parser
    ////////////////////////////////////////////////////////////////////////////////////////////

    // Turns off all PHP error reporting for the rest of the script.
    error_reporting(0);
    class 
    HtmlParser {
      var 
    $pos,
          
    $tagpos,
          
    $length,
          
    $data,
          
    $stacktag,
          
    $stacktagpos,
          
    $name,
          
    $quotstate,
          
    $quottype,
          
    $parname,
          
    $pars,
          
    $tagname,
          
    $content,
          
    $contentpos,
          
    $allreadyparsed,
          
    $pg,
          
    $dc,
          
    $nc,
          
    $qc,
          
    $prevstate,
          
    $processtag,
          
    $processpar,
          
    $processparvalue,
          
    $c,
          
    $cp,
          
    $text,
          
    $incomment,
          
    $skipto,
          
    $tagreg,
          
    $arr,
          
    $wasquot;

      function 
    HtmlParser($data,$grammar,$name="",$datatype=0) {
        
    $this->dc=array(" ","\t","\r","\n","<",">","\"","'","=","/");
        
    $this->nc=array("<",">","=","/");
        
    $this->qc=array("\"","'");
        
    $this->sc=array("\r","\n"," ","\t");
        
    $this->prevstate=array("state"=>0,"word"=>"");
        
    $this->pg=&$grammar;
        
    $this->pos=0;
        
    $this->stacktag=array();
        
    $this->stacktagpos=-1;
        
    $this->content=array();
        
    $this->content["contentpos"]=-1;
        
    $this->c=&$this->content;
        
    $this->cp=-1;
        
    $this->quotstate=-1;
        
    $this->allreadyparsed=0;
        
    $this->text="";
        
    $this->processtag=0;
        
    $this->processpar=0;
        
    $this->processparvalue=0;
        
    $this->slevel=array(0);
        
    $this->slevelpos=0;
        
    $this->quottype="";
        
    $this->skipto="";
        
    $this->incomment=0;
        
    $this->tagreg=array();
        
    $this->arr=array();
        
    $this->wasquot=0;

        if(
    is_array($this->data)) {
          
    $this->content=&$data;
          
    $this->allreadyparsed=1;
          return;
        }
        
    clearstatcache();
        
    $this->name=$data;
        if (!
    $datatype) {
          
    $this->name=$name;
          
    $this->data=$data;
          
    $this->length=strlen($this->data);
          return;
        }
        if (!
    $fp=fopen($this->name,"rb")) {
          
    $this->SetError(1,"Can't open file $this->name.",0,0,"Error");
          return;
        }
        
    flock($fp,1);
        
    $this->data=fread($fp,filesize($this->name));
        
    flock($fp,3);
        
    fclose($fp);
        
    $this->length=strlen($this->data);
      }


      function 
    GetWord($word) {
        
    $word="";
        
    $this->wasquot=0;
        if (
    $this->pos>$this->length) return false;
        while (
    1) {
          if (
    $this->pos>$this->length) return false;
          if (
    $this->pos==$this->length) {
            
    $this->pos++;
            return 
    true;
          }
          if (
    $this->data[$this->pos]=="<") {
            if (
    $this->data[$this->pos+1]=="!")
              if (
    $this->length>&& $this->length-$this->pos+1>6) {
                if (
    substr($this->data,$this->pos,4)=="<!--") {
                  
    $this->incomment=1;
                  while(
    $this->pos<$this->length-3) {
                    if (
    substr($this->data,$this->pos,3)=="-->") {
                      
    $word.="-->";
                      
    $this->pos+=3;
                      break;
                    } else
                      
    $word.=$this->data[$this->pos++];
                  }
                  if (
    $this->incomment) break;
                }
              }
          }
          if (!
    $this->processtag) {
            if (
    $this->data[$this->pos]=="<") {
              
    $this->processtag=1;
              
    $this->tagpos=strlen($this->text);
            } else {
              
    $this->text.=$this->data[$this->pos++];
              continue;
            }
          }
          if (
    in_array($this->data[$this->pos],$this->dc)) {
            if ((
    $this->data[$this->pos]=="<" || $this->data[$this->pos]==">") && $this->quotstate==-&& $this->processparvalue) {
              
    $this->processparvalue=0;
              return 
    true;
            }
            if (
    in_array($this->data[$this->pos],$this->sc) && $this->quotstate==-1) {
              
    $this->text.=$this->data[$this->pos++];
              if (
    strlen($word)) {
                if (
    $this->processparvalue$this->processparvalue=0;
                return 
    true;
              } else
                continue;
            }
            if (!
    strlen($word)) {
              if (
    in_array($this->data[$this->pos],$this->qc) && $this->processpar) {
                if (
    $this->quotstate==-1) {
                  
    $this->wasquot=1;
                  
    $this->quotstate*=-1;
                  
    $this->quottype=$this->data[$this->pos];
                  
    $this->text.=$this->data[$this->pos++];
                  continue;
                } elseif (
    $this->quottype==$this->data[$this->pos]) {
                  
    $this->quotstate*=-1;
                  
    $this->quottype=$this->data[$this->pos];
                  
    $this->processpar=$this->processparvalue=0;
                  
    $this->text.=$this->data[$this->pos++];
                  return 
    true;
                }
              } elseif (
    in_array($this->data[$this->pos],$this->nc)) {
                
    $word.=$this->data[$this->pos];
                
    $this->text.=$this->data[$this->pos++];
                if (
    $this->processparvalue)
                  continue;
                else
                  return 
    true;
              }
            } else {
              if (
    in_array($this->data[$this->pos],$this->qc) && $this->processpar) {
                if (
    $this->quotstate==1) {
                  if (
    $this->data[$this->pos]==$this->quottype && $this->processparvalue) {
                    
    $this->quotstate*=-1;
                    
    $this->quottype=$this->data[$this->pos];
                    
    $this->processpar=$this->processparvalue=0;
                    
    $this->text.=$this->data[$this->pos++];
    //                continue;
                  
    } else {
                    if (
    $this->data[$this->pos]==$this->quottype) {
                      
    $this->quotstate*=-1;
                      
    $this->quottype="";
                    }
                    
    $word.=$this->data[$this->pos];
                    
    $this->text.=$this->data[$this->pos++];
                    continue;
                  }
                }
                return 
    true;
              } else {
                if (
    in_array($this->data[$this->pos],$this->nc)) {
                  if (
    $this->quotstate==-1) {
                    if (
    $this->processparvalue) {
                      if(
    $this->data[$this->pos]!="/" && $this->data[$this->pos]!="=") return true;
                      
    $word.=$this->data[$this->pos];
                      
    $this->text.=$this->data[$this->pos++];
                      continue;
                    }
                  } else {
                    
    $word.=$this->data[$this->pos];
                    
    $this->text.=$this->data[$this->pos++];
                    continue;
                  }
                  return 
    true;
                } elseif (
    $this->quotstate==-&& $this->processparvalue && strlen($word)) {
                  if (
    $this->data[$this->pos]==" ") {
                    
    $this->text.=$this->data[$this->pos++];
                    
    $this->processparvalue=0;
                    return 
    true;
                  }
                }
              }
            }
          }
          
    $word.=$this->data[$this->pos];
          
    $this->text.=$this->data[$this->pos++];
        }
        return 
    true;
      }


      function 
    Parse() {
        
    $automat=array(
    // states         0   1   2   3   4   5   6   7   8
          
    "0"=>array( 1, -1, -1, -1, -1, -1, -1, -1, -1),// <
          
    "1"=>array(-1,  7,  6,  6,  6,  6, -1, -1, -1),// /
          
    "2"=>array(-1, -1, -1,  4, -1, -1, -1, -1, -1),// =
          
    "3"=>array(-1, -1, -2, -2, -2, -2, -2, -1, -3),// >
          
    "4"=>array(-1,  2,  3,  3,  5,  3, -1,  8, -1// any word
        
    );
        if (!
    strlen($this->data)) return;
        
    $instates=array("<"=>0,"/"=>1,"="=>2,">"=>3);
        
    $parcount=0;
        
    $state=0;
        
    $this->c=&$this->content;
        
    $this->cp=&$this->content["contentpos"];
        
    $this->stacktag[0]["tag"]=&$this->c;
        
    $this->stacktag[0]["level"]=&$this->slevel;
        
    $this->stacktag[0]["levelpos"]=0;
        
    $this->stacktagpos=0;
        while(
    1) {
          if (!
    $isword=$this->GetWord(&$word)) break;
          
    $w=strtolower($word);
          if (!isset(
    $instates[$w]))
            
    $instate=4;
          else
            
    $instate=$instates[$w];
    //print htmlspecialchars($word).",$state,$instate,$this->quottype<br>";
          
    $state=$automat[$instate][$state];
          if (
    $this->wasquot && $state==6$state=5;
    //print htmlspecialchars($word).",$state<br>";
          
    switch($state) {
            case -
    3:// end parse close tag
              
    if (strlen($this->skipto) && $this->tagname!=$this->skipto) {
                
    $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
                
    $this->pars=array();
                break;
              } else
                
    $this->skipto="";
              
    $script=($this->tagname=="script") ? 1:0;
              
    $this->AddNewText(substr($this->text,0,$this->tagpos),$script);
              
    $this->AddNewTag(0);
              
    $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
              
    $this->quottype="";
              
    $this->quotstate=-1;
              
    $this->text="";
              
    $this->pars=array();
              
    $this->tagpos=0;
              break;
            case -
    2:// end parse open tag
              
    if (strlen($this->skipto)) {
                
    $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
                
    $this->pars=array();
                break;
              }
              
    $this->AddNewText(substr($this->text,0,$this->tagpos));
              
    $this->AddNewTag(1,$xmlclose);
              
    $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
              
    $this->quottype="";
              
    $this->quotstate=-1;
              
    $this->text="";
              
    $this->pars=array();
              
    $this->tagpos=0;
              if (isset(
    $this->pg[$this->tagname]["nohavetags"]) && !strlen($this->skipto)) $this->skipto=$this->tagname;
              break;
            case -
    1:// Error found
              
    $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
              
    $this->pars=array();
              if (
    $this->incomment) {
                if (
    strlen($this->text)) {
                  
    $this->AddNewText($this->text);
                  
    $this->text="";
                  
    $this->tagpos=0;
                }
                
    $this->AddNewText($word,0,1);
                
    $this->incomment=0;
                break;
              }
              if (
    $word=="<") {
                
    $state=1;
                
    $this->processtag=1;
                
    $this->processparvalue=0;
                
    $this->tagpos=strlen($this->text)-1;
                
    $this->quottype="";
                
    $this->quotstate=-1;
              }
              break;
            case 
    2:// got any word as tagname, waiting '/' or '>' or any word as parameter name
              
    $this->tagname=$w;
              
    $xmlclose=0;
              if (!
    ereg("^[a-zA-Z0-9!_-]+$",$this->tagname) || strlen($this->skipto)) {
                
    $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
                
    $this->quottype="";
                
    $this->quotstate=-1;
                
    $this->pars=array();
                break;
              }
              break;
            case 
    3:// got any word as parameter name, waiting '/' or '>' or '=' or any word as parameter name
              
    $this->parname=$w;
              if (!
    ereg("^[a-zA-Z0-9!_-]+$",$this->parname) || strlen($this->skipto)) {
                
    $parcount=$state=$this->processpar=$this->processparvalue=$this->processtag=0;
                
    $this->quottype="";
                
    $this->quotstate=-1;
                
    $this->pars=array();
                break;
              }
              
    $this->processpar=1;
              if (
    $w!="/") {
                
    $parcount++;
                
    $this->pars[$this->parname]["single"]=1;
              } else
                
    $xmlclose=1;
              break;
            case 
    4:// got '=' waiting '/' or '>' or any word as parameter value
              
    $this->processparvalue=1;
              break;
            case 
    5:// got any word as parameter value, waiting '/' or '>' or any word as parameter name
              
    if ($this->parname!="/") {
                unset(
    $this->pars[$this->parname]["single"]);
                
    $this->pars[$this->parname]["value"]=$word;
                
    $this->pars[$this->parname]["quot"]=$this->quottype;
              }
              
    $this->quottype="";
              
    $this->processpar=$this->processparvalue=0;
              break;
            case 
    6:// got '/' waiting '>'
              
    $xmlclose=1;
              break;
            case 
    8:// got any word as close tag name, waiting '>'
              
    $this->tagname=$w;
              break;
          }
          
    $this->prevstate["states"]=$state;
          
    $this->prevstate["word"]=$word;
        }
        if (
    strlen($this->text)) $this->AddNewText($this->text);
      }

      function 
    AddNewTag($open,$xmlclose=0) {
        
    $actionclose=0;
        if (!
    $open && $this->pg[$this->tagname]["endtag"]!="absent"$actionclose=1;

        if (
    $open)
          for (
    $i=$this->stacktagpos;$i>0;$i--) {
            
    $ct=&$this->stacktag[$i]["tag"];
            
    $t=&$ct[$ct["contentpos"]];
            
    $tagname=$t["data"]["name"];
            if (isset(
    $this->pg[$tagname]["closeon"])) {
              if (
    sizeof($this->pg[$tagname]["closeon"]["in"]) && in_array($this->tagname,$this->pg[$tagname]["closeon"]["in"]) || sizeof($this->pg[$tagname]["closeon"]["notin"]) && !in_array($this->tagname,$this->pg[$tagname]["closeon"]["notin"])) {
                
    $actionclose=2;
                break;
              }
            }
            if (
    $actionclose!=2$i=-1;
          }

        if (
    $actionclose) {
          if (
    $actionclose==1) {
            
    $i=$this->FindTag($this->tagname);
            if (
    $i>-1)
              if (
    $this->tagreg[$this->tagname]!=$this->stacktag[$i]["num"])
                
    $i=-1;
          }
          if (
    $i>-1) {
            
    $this->c=&$this->stacktag[$i]["tag"];
            
    $this->cp=&$this->c["contentpos"];
            
    $this->stacktagpos=$i;
            if (
    $actionclose==1) {
              
    $c=&$this->c[$this->c["contentpos"]]["content"];
              
    $cp=&$this->c[$this->c["contentpos"]]["content"]["contentpos"];
              
    $cp++;
              
    $c[$cp]["type"]="tag";
              
    $c[$cp]["data"]["name"]=$this->tagname;
              
    $c[$cp]["data"]["type"]="close";
              if (isset(
    $this->tagreg[$this->tagname]))
                if (
    $this->tagreg[$this->tagname])
                  
    $this->tagreg[$this->tagname]--;
              
    $this->stacktag[$this->stacktagpos]["num"]=$this->tagreg[$this->tagname];
              
    $this->stacktagpos--;
            }
            if (
    $this->stacktagpos<sizeof($this->stacktag))
              for (
    $i=$this->stacktagpos+1;$i<sizeof($this->stacktag);$i++)
              unset(
    $stacktag[$i]);
            if (
    $actionclose==1) return;
          }
        }
        
    $this->cp++;
        
    $this->c[$this->c/p]["type"]="tag";
        
    $this->c[$this->cp]["data"]["name"]=$this->tagname;
        
    $this->c[$this->cp]["data"]["type"]=($open) ? "open" "close";
        if (!
    $open)
          if (isset(
    $this->tagreg[$this->tagname]))
            if (
    $this->tagreg[$this->tagname])
              
    $this->tagreg[$this->tagname]--;
        if (
    $xmlclose$this->c[$this->cp]["xmlclose"]=1;
        if (
    sizeof($this->pars)) $this->c[$this->cp]["pars"]=$this->pars;
        if (
    $open && !$xmlclose && $this->pg[$this->tagname]["endtag"]!="absent") {
          if (!isset(
    $this->tagreg[$this->tagname])) $this->tagreg[$this->tagname]=0;
          
    $this->tagreg[$this->tagname]++;
          
    $this->stacktagpos++;
          
    $this->stacktag[$this->stacktagpos]["tag"]=&$this->c;
          
    $this->stacktag[$this->stacktagpos]["num"]=$this->tagreg[$this->tagname];
          
    $this->c[$this->cp]["content"]=array();
          
    $this->c[$this->cp]["content"]["contentpos"]=-1;
          
    $this->c=&$this->c[$this->cp]["content"];
          
    $this->cp=&$this->c["contentpos"];
        }
      }


      function 
    AddNewText($text,$script=0,$comment=0) {
        if (!
    strlen($text)) return;
        
    $this->cp++;
        if (!
    $comment){
          
    $this->c[$this->cp]["type"]="text";
          
    $this->c[$this->cp]["ot"]=$this->tagname;
          }
        else
          
    $this->c[$this->cp]["type"]="comment";
        if (
    $script) {
          
    $inputarray=array("/_top/","/top.location.href/","/([ \n]+)?window\.name/","/parent.location/");
          
    $replarray=array("_echoserver_file_space","parent.frames('_echoserver_file_space').src","//window.name","parent.frames('_echoserver_file_space').src");

          
    $text=preg_replace($inputarray,$replarray,$text);

        }
        
    $this->c[$this->cp]["data"]=$text;
        
    $this->text="";
      }


      function 
    FindTag($tagname) {
        for(
    $i=$this->stacktagpos;$i>=0;$i--)
          if (
    $this->stacktag[$i]["tag"][$this->stacktag[$i]["tag"]["contentpos"]]["data"]["name"]==$tagname)
            return 
    $i;
        return -
    1;
      }


    /**
     * Walks a parsed container recursively and, for every opening <td> or
     * <div>, records one candidate in the parallel lists of $this->arr:
     *   'arr'  - the element's raw content subtree,
     *   'text' - its extracted text with every tag except <img> stripped,
     *   'tag'  - the tag name as stored in the tree,
     *   'len'  - byte length of the extracted text before stripping.
     *
     * @param array $arr container as built by Parse(): numeric keys plus 'contentpos'
     */
    function getarr($arr){
        // Elements without children carry no content; nothing to walk.
        if (!is_array($arr) || !isset($arr['contentpos'])) return;

        $count=$arr['contentpos']+1;
        for ($i=0; $i<$count; $i++) {
            // Only tag nodes keep an array in 'data'; text and comment nodes hold a string.
            if (!isset($arr[$i]['data']) || !is_array($arr[$i]['data'])) continue;

            $name=strtolower($arr[$i]['data']['name']);
            $content=isset($arr[$i]['content']) ? $arr[$i]['content'] : null;

            if (($name=="td" || $name=="div") && $arr[$i]['data']['type']=="open") {
                $this->arr['arr'][]=$content;
                $str=$this->getmytext($content);
                $this->arr['text'][]=strip_tags($str,"<img>");
                $this->arr['tag'][]=$arr[$i]['data']['name'];
                $this->arr['len'][]=strlen($str);
            }
            // Descend into every tag, whether or not it was recorded.
            $this->getarr($content);
        }
    }
     function 
    getmytext($arr){
        
    $text='';
        
    $arr['contentpos']=$arr['contentpos']+1;
    for (
    $i=0$i<$arr['contentpos']; $i++) {
    if((
    $arr[$i]['type']=="text")or(!is_array($arr[$i]['data']))){
    if ((
    preg_match("/[.!?,]/i",$arr[$i]['data']))and(strlen(trim($arr[$i]['data']))>0)){
       if(
    strlen($arr[$i]['ot'])>0){
           
    $text.="<".$arr[$i]['ot']."> ".trim(strip_tags($arr[$i]['data'],"<img>"))."</".$arr[$i]['ot'].">\r\n";
           }else{
           
    $text.=" ".trim(strip_tags($arr[$i]['data'],"<img>"));
           }
           }elseif ((
    $this->issettt($arr[$i]['ot']))and(strlen(trim($arr[$i]['data']),"<img>")>0)){
           
    $text.=" ".trim(strip_tags($arr[$i]['data'],"<img>"));
           }
       }else{
       if(
    $this->issetm($arr[$i]['data']['name'])){
           
    $text.=$this->getmytext($arr[$i]['content']);
           }
       }
    }
    return 
    $text;
    }
    /**
     * Tells whether a tag name is one of the inline tags b, i, strong, em or a.
     * The comparison is case-insensitive.
     *
     * @param string $str   tag name
     * @param string $param unused
     * @return bool
     */
    function issettt($str,$param="none"){
        $inlineTags=array("b","i","strong","em","a");
        return in_array(strtolower($str),$inlineTags,true);
    }
    /**
     * Tells whether a tag may be descended into when collecting text: every
     * tag except td, tr, table, div, a and script. The comparison is
     * case-insensitive.
     *
     * @param string $str   tag name
     * @param string $param unused
     * @return bool false for the excluded tags, true otherwise
     */
    function issetm($str,$param="none"){
        $excluded=array("td","tr","table","div","a","script");
        return !in_array(strtolower($str),$excluded,true);
    }
    /**
     * Returns the candidate text with the largest recorded length.
     * On ties the earliest candidate wins.
     *
     * @return string the 'text' entry matching the largest 'len' entry in
     *                $this->arr, or '' when no candidates were collected
     */
    function maxstr(){
        if (empty($this->arr['len']) || !is_array($this->arr['len'])) return '';

        $max=0;
        $ch=0;
        $count=count($this->arr['len']);
        for ($i=0; $i<$count; $i++) {
            $tec=$this->arr['len'][$i];
            if ($max<$tec) {
                $max=$tec;
                $ch=$i;
            }
        }
        return isset($this->arr['text'][$ch]) ? $this->arr['text'][$ch] : '';
    }
    }







    /**
     * Parses an HTML string and returns the largest text block found in it.
     *
     * The parser grammar is unserialized from the file "ff1.php", resolved
     * against the current working directory.
     * NOTE(review): unserialize() on file contents is only safe while that
     * file is trusted - confirm where ff1.php comes from.
     *
     * @param string $text HTML source
     * @return mixed whatever HtmlParser::maxstr() returns for the parsed page
     */
    function getcontent($text){
        $grammar=unserialize(file_get_contents("ff1.php"));
        $parser=new HtmlParser($text,$grammar);
        $parser->Parse();
        $parser->getarr($parser->content);
        return $parser->maxstr();
    }
    ?>
    Только учти: если пути к картинкам будут относительные (без http:// ...), то картинок в тексте ты не увидишь ))))

    тут код нужно будет дописывать
     
  4. fatalfist

    fatalfist Создатель

    Регистр.:
    11 мар 2007
    Сообщения:
    40
    Симпатии:
    10
    рекомендую посмотреть в сторону PHP Simple HTML DOM Parser Перейти по ссылке
    офигенный функционал + куча примеров на все случаи жизни
     
  5. karapet

    karapet

    Регистр.:
    9 сен 2008
    Сообщения:
    168
    Симпатии:
    104
    А работает ли он, если неизвестно, в каких тегах находится текст? И, если можно, пример, плиз. Интересует именно парсинг, например, первой попавшейся страницы с неизвестной структурой.
     
  6. satih

    satih

    Регистр.:
    19 сен 2008
    Сообщения:
    429
    Симпатии:
    710
    Если структура неизвестна, как он будет парсить? ИИ? Тогда уже надо самому мозг включать: как заставить парсер увидеть в коде большой кусок текста. Насколько знаю, один из простых методов — вырезать теги, присущие тексту, и смотреть, в каком куске страницы останется больше всего текста.
     
Статус темы:
Закрыта.