Zend - The PHP Company




Searching and Trees

Add Code


ananse  

Type: application
Added by: mynelis
Entered: 13/06/2005
Last modified: 02/12/2005
Rating: - (fewer than 3 votes)
Views: 5528
ananse is a web crawler that collects links from a website, crawls them and indexes the data into a database for use in searching. This version, "ananse v1.1", is the second experimental version of the real "ananse" project. Developed by Devonet Multimedia, Accra, Ghana. http://www.devonet.com


/*
    Ananse v1.1 is still under development. This is just
    the second experimental module of the "ananse" project.
    Developed by Cornelius Duhadzi, CEO and Creative Director
    of Devonet Multimedia, Accra, Ghana. http://www.devonet.com
    
    Download ananse v1.1 at: http://www.devonet.com/free.
    Send your comments, questions and suggestions to
    mynelis@gmail.com
*/

SET UP THE DATABASE
======================================================
    Create a database named "mycrawler"

    Create a table called "ananse" in the "mycrawler" database as below:

    -- Storage for the crawler: index.php writes one row per (site URL, link) pair,
    -- updating the existing row when the same pair is indexed again.
    CREATE TABLE `ananse` (
      `acID` bigint(20) NOT NULL auto_increment,   -- surrogate row id
      `acURL` varchar(250) default NULL,           -- site URL taken from urls.txt
      `acMetaDes` text NOT NULL,                   -- meta description, stored base64-encoded by index.php
      `acMetaKey` text NOT NULL,                   -- meta keywords, stored base64-encoded by index.php
      `acLink` text NOT NULL,                      -- href of the link as found on the site
      `acTitle` varchar(250) NOT NULL default '',  -- page title, base64-encoded; NOTE(review): long titles may exceed 250 chars once encoded - confirm
      `acContent` longtext NOT NULL,               -- tag-stripped page text, stored base64-encoded by index.php
      PRIMARY KEY  (`acID`)
    ) ENGINE=MyISAM COMMENT='ananse mini crawler';


SET UP THE URLs YOU WANT TO CRAWL (urls.txt)
======================================================
    Put the URLs you want to crawl in a text file named 
    "urls.txt". The file must be in the same directory as
    index.php. Put each URL on a new line.


SET UP THE CONFIG FILE (config.php)
======================================================
<?php
# config.php - shared settings for the ananse crawler.
# Sets runtime options, defines the table/column names used by index.php
# and opens the MySQL connection ($conn) that index.php closes when done.

#general settings

# Suppress PHP notices/warnings so they do not end up in the crawl output.
error_reporting(0);

# 0 = no time limit; a crawl can run far longer than the default limit.
ini_set("max_execution_time", 0);


#A: mysql database server login settings

$dbhost = "localhost";

# Database name. The setup notes create the database as "mycrawler"
# ("ananse" is the name of the table inside it).
$dbdb   = "mycrawler";

$dbuser = "";

$dbpass = "";


#B: mysql database storage details
# These constants map logical names to the table and its columns so the
# crawler code never hard-codes them.

define('table', 'ananse');

define('id_field', 'acID');

define('url_field', 'acURL');

define('meta_description_field', 'acMetaDes');

define('meta_keywords_field', 'acMetaKey');

define('link_field', 'acLink');

define('title_field', 'acTitle');

define('content_field', 'acContent');


#connect to database server

$conn = mysql_connect($dbhost, $dbuser, $dbpass) or die("Could not connect to server");

mysql_select_db($dbdb, $conn) or die("Could not connect to database");
?>


SET UP THE MAIN CRAWLER PAGE (index.php)
======================================================
<?php

//includes
// config.php defines the table/column constants and opens the database connection ($conn).
require_once("config.php");
    
    
/**
 * Open a URL (or any path fopen() accepts) and return its full contents.
 *
 * The resource is read through a single handle, which is closed before
 * returning, so the target is fetched only once and no handle is leaked.
 *
 * @param string $url Address to read.
 * @return string|false The contents, or false when the target cannot be opened or read.
 */
function open_url($url){
    $handle = fopen($url, 'r');
    if($handle === false){
        return false;
    }
    $data = stream_get_contents($handle);
    fclose($handle);
    return $data;
}
#end open url
        
    //collect links
    
/**
 * Fetch a page and collect the href of every <a ...>text</a> element on it.
 *
 * @param string $url Page to scan.
 * @return array array('url' => the given $url,
 *                     'links' => unique hrefs, re-indexed 0..n-1 in order of first appearance).
 *               'links' is empty when the page cannot be fetched or has no anchors.
 */
function collect_links($url){
    $combined = array();
    $combined["url"] = $url;
    $combined['links'] = array();

    $site = open_url($url);
    if($site === false || $site === ""){
        // Nothing to scan: page unreachable or empty.
        return $combined;
    }

    // Group 1 = href value, group 2 = anchor text. The "s" modifier lets an
    // anchor span several lines.
    $link = array();
    preg_match_all('/<a\s[^>]*?href\s*=\s*["\']([^"\']+)["\'][^>]*>(.+?)<\/a>/is', $site, $link, PREG_SET_ORDER);
    foreach($link as $match){
        $combined['links'][] = $match[1];
    }

    // array_unique() keeps the original keys, leaving gaps; re-index so that
    // callers iterating 0..count-1 see every link.
    $combined['links'] = array_values(array_unique($combined['links']));
    return $combined;
}#end collect links
        
    #get absolute path of link
    function locate($link, $url){
        $path = $link;
        if(!preg_match("(http://|www.)i", $link))$path = $url."/".$link;
        if(!fopen($path, "r")){
            return "";
        } else return $path;
    }#end get path
    
    //get meta tags
    function get_meta($link, $url){
        $link = locate($link, $url);
        $meta = array('
description', 'keywords');
        $metatags = get_meta_tags($link);
        $meta['
description'] = isset($metatags['description'])?$metatags['description']:"";
        $meta['
keywords'] = isset($metatags['keywords'])?$metatags['keywords']:"";
        return $meta;
    }#end meta tags
    
    //get page title
    function get_title($link, $url){
        $title = "";
        $match = array();
        $link = locate($link, $url);
        $content = file_get_contents($link);
        if(preg_match("/<title>(.+?)</title>/i", $content, $match)){
            $title = isset($match[1])?$match[1]:$title;
        }
        return $title;
    }#end page title
    
    //get page content
    function get_content($link, $url){
        $link = locate($link, $url);
        if(!empty($link)){
            return file_get_contents($link);
        } else return "";
    }#end page content
    
    //index all urls
    function index($url){
        if(!fopen($url, "r")){
            print "<br><h3><i>".$url."</i> could not be reached.</h3>";    
            flush();        
        } else {
            print "<script>window.status='
Indexing".$url."';</script>";
            print "<br><h3>Indexing <i>".$url."</i>...</h3>";
            print "<ol>";
            $collection = collect_links($url);
            $total = count($collection['
links']);
            for($i=0; $i<$total; $i++){
                $current = isset($collection['
links'][$i])?$collection['links'][$i]:"";
                if(eregi(" ",$current))$current = "";
                if(!empty($current)){                    
                    if(!get_content($current, $url)){
                        $reachable = false;
                    } else {
                        $reachable = true;
                    }
                    $meta = get_meta($current, $url);
                    
                    print "<li>".$current."</li>";
                    print "<script>window.status='
Current URL$url Current Position".($i+1)." of $total Current Link".$current."';</script>";
                    print "<script>document.title='
Indexing ".$current."';</script>";
                    flush();                    
                    
                    $matades = base64_encode($meta['
description']);
                    $metakey = base64_encode($meta['
keywords']);
                    $title = base64_encode(get_title($current, $url));
                    $content = base64_encode(strip_tags(get_content($current, $url)));
                    #end encoding
                                
                    $sql = "select ".id_field." from ".table." where ".url_field."='
$url' and ".link_field."='$current'";
                    $test = mysql_query($sql);
                    $len = mysql_num_rows($test);
                    
                    if($len == 0){
                        $doSql = "insert into ".table;
                        $doSql .= " (`".url_field."`,`".meta_description_field."`,`".meta_keywords_field."`,`".link_field."`,`".title_field."`,`".content_field."` )";
                        $doSql .= " values ('
$url', '$matades', '$metakey', '$current', '$title', '$content')";
                    } else {
                        $doSql = "update ".table;                    
                        $doSql .= "  set ".meta_description_field."='
$matades'";
                        $doSql .= " ,".meta_keywords_field."='
$metakey'";
                        $doSql .= "  , ".title_field."='
$title'";
                        $doSql .= "  , ".content_field."='
$content'";
                        $doSql .= "  where ".url_field."='
$url' and ".link_field."='$current'";
                    }
                    flush();
                    
                    if(mysql_query($doSql) and $reachable == true){
                        print " <b> OK </b>";
                    } elseif($reachable == false) {
                        print " <b> FAILED </b>";
                    }
                    flush();                
                }
            }
            print "</ol>";
            print "<h4>Done</h4>";
            print "<script>window.status='
$url indexed successfully!';</script>";
            print "<script>window.title='
$url indexed successfully!';</script>";
        }
    }#end index    



    #start crawling
    if(isset($_GET['
crawl'])){        
        $urlList = file("urls.txt");
        if(isset($_GET['
url'])){
            $i = $_GET['
offset'];
            $url = trim($urlList[$i]);            
        } else {
            $i = 0;
            $url = trim($urlList[0]);
        }
        index($url);
        
        mysql_close($conn);
        if($i < count($urlList)-1){
            print "<script>location='
index.php?crawl&url=".$url."&offset=".($i+1)."';</script>";
        } else {
            print "<script>location='
index.php';</script>";
        }
    } else {
        print "<a href="index.php?crawl">Crawl Entire List</a>";
    }
?>


Usage Example


See the example


Rate This Script





Search



This Category All Categories