File: simple_crawler_example.php

Recommend this page to a friend!

???

File:	`simple_crawler_example.php`
Role:	Example script
Content type:	`text/plain`
Description:	example
Class:	Simple Page Crawler: Retrieve HTML pages and extract their elements
Author:	By Jacek Lukasiewicz
Last change:
Date:	14 years ago
Size:	`1,382 bytes`

Download


<?php 


/**


 * Example use of the SimpleCrawler class library


 */





require 'simple_crawler.classes.php';





// Reader object used to download page content by URL.
$reader = new HtmlReader();

// Front page of the site this example crawls.
$page = 'http://falsztyn.boo.pl';
//$page = 'http://www.phpclasses.org';

// Download the front page and wrap the content in a document object.
$frontDocument = new HtmlDocument($reader->getPageContent($page));

// Body part of the front page document.
$frontBody = $frontDocument->getBody();

// Array of link objects found in the front page body.
$links = $frontBody->grabLinks();

// Clean text version of the front page body.
$frontCleanBody = $frontBody->getStrippedBody();

// Word counter (word => count); the front page words are appended
// without a page key.
$words = new BodyWords();
$words->appendWords($words->findWords($frontCleanBody->getContent()));








// Follow the links found on the front page. Only one level is crawled:
// links discovered on the fetched pages are not followed in turn.
foreach ($links as $link) {
    // '/' points back at the front page, whose words were counted earlier.
    if ($link->url === '/') {
        continue;
    }

    // Only links of type 1 are followed; every other type is skipped
    // (this example treats those as external links). To follow them as
    // well, fetch $link->url directly instead of skipping.
    // NOTE(review): loose comparison kept on purpose - the declared type
    // of $link->type is not visible from this script.
    if ($link->type != 1) {
        continue;
    }

    // Join base URL and link path with a single separating slash. Plain
    // concatenation produced "http://hostpath" for paths without a
    // leading slash (or "host//path" if $page ended with one).
    $path = $link->url;
    if (strncmp($path, '/', 1) !== 0) {
        $path = '/' . $path;
    }
    $pageLink = rtrim($page, '/') . $path;

    // Download the linked page and reduce it to its clean body text.
    $html = $reader->getPageContent($pageLink);
    $htmlDoc = new HtmlDocument($html);
    $body = $htmlDoc->getBody();
    $cleanBody = $body->getStrippedBody();

    // Count the words of this page and store them under the link URL.
    $pageWords = $words->findWords($cleanBody->getContent());
    $words->appendWords($pageWords, $link->url);
}

// Display words:count per page.
print_r($words->getWords());

// Here you may do something with these words.





?>

About us

Advertise on this site

For more information send a message to info at phpclasses dot org.

File: simple_crawler_example.php

Contents