<?php 
 
/**
 
 * Example usage of the SimpleCrawler class library
 
 */
 
 
require 'simple_crawler.classes.php';
 
 
// Reader object that downloads page content for a given URL.
$reader = new HtmlReader();

// Base URL of the site to crawl; its front page is processed first.
// Alternative target for experiments: 'http://www.phpclasses.org'
$page = 'http://falsztyn.boo.pl';

// Fetch the front page and wrap the returned content in a document object.
$frontHtml = $reader->getPageContent($page);
$frontDocument = new HtmlDocument($frontHtml);

// Body part of the front page document.
$frontBody = $frontDocument->getBody();

// Array of link objects found in the front page body; iterated further below.
$links = $frontBody->grabLinks();

// Clean-text version of the front page body.
$frontText = $frontBody->getStrippedBody();

// Word counter shared by all processed pages (word => count).
// The front page words are appended without a page key.
$words = new BodyWords();
$frontWords = $words->findWords($frontText->getContent());
$words->appendWords($frontWords);
 
 
 
// Follow the links found on the front page, one level deep: links discovered
// on the followed pages are not visited.
foreach ($links as $link) {
    // '/' points back to the front page, which has already been processed.
    if ($link->url == '/') {
        continue;
    }

    // Only links of type 1 are followed; every other type is treated as
    // external and skipped. To follow external links as well, fetch
    // $link->url unchanged instead of skipping here.
    if ($link->type != 1) {
        continue;
    }

    // Join base URL and link path with exactly one slash. Plain concatenation
    // glued the path onto the host name when the link had no leading slash
    // and produced '//' when the base URL ended with one.
    $pageLink = rtrim($page, '/') . '/' . ltrim((string) $link->url, '/');

    $html = $reader->getPageContent($pageLink);

    // Same pipeline as for the front page: document -> body -> clean text.
    $htmlDoc = new HtmlDocument($html);
    $body = $htmlDoc->getBody();
    $cleanBody = $body->getStrippedBody();

    // Count the words of this page and store them under the link's URL.
    $pageWords = $words->findWords($cleanBody->getContent());
    $words->appendWords($pageWords, $link->url);
}
 
 
// Display the collected words with their counts, per page.
print_r($words->getWords());

// From here on the collected words can be processed further.

// The closing PHP tag is omitted on purpose: anything that follows it, even
// whitespace, is sent to the output after the dump above.