/*
Ananse v1.1 is still under development. This is just
the second experimental module of the "ananse" project.
Developed by Cornelius Duhadzi, CEO and Creative Director
of Devonet Multimedia, Accra, Ghana. http://www.devonet.com
Download ananse v1.1 at: http://www.devonet.com/free.
Send your comments, questions and suggestions to
mynelis@gmail.com
*/
SET UP THE DATABASE
======================================================
Create a database named "mycrawler".
Create a table called "ananse" in the "mycrawler" database as below:
CREATE TABLE `ananse` (
`acID` bigint(20) NOT NULL auto_increment,
`acURL` varchar(250) default NULL,
`acMetaDes` text NOT NULL,
`acMetaKey` text NOT NULL,
`acLink` text NOT NULL,
`acTitle` varchar(250) NOT NULL default '',
`acContent` longtext NOT NULL,
PRIMARY KEY (`acID`)
) ENGINE=MyISAM COMMENT='ananse mini crawler';
SET UP THE URLs YOU WANT TO CRAWL (urls.txt)
======================================================
Put the URLs you want to crawl in a text file named
"urls.txt". The file must be in the same directory as
index.php. Put each URL on its own line.
SET UP THE CONFIG FILE (config.php)
======================================================
<?php
# config.php - shared settings for the crawler. Included by index.php.
# Uses the full "<?php" open tag so the file is parsed even when the
# short_open_tag ini setting is disabled.

#general settings
# All error output is silenced; the crawler relies on this because remote
# fetches (fopen/file_get_contents) emit warnings for unreachable links.
error_reporting(0);
# 0 = no time limit: a crawl can run far longer than the default limit.
ini_set("max_execution_time", 0);

#A: mysql database server login settings
$dbhost = "localhost";
# Name of the DATABASE (not the table). The setup instructions create a
# database called "mycrawler" that contains the table "ananse".
$dbdb = "mycrawler";
$dbuser = "";
$dbpass = "";

#B: mysql database storage details
# Table and column names, exposed as constants so index.php can build its
# queries without hard-coding the schema.
define('table','ananse');
define('id_field','acID');
define('url_field','acURL');
define('meta_description_field','acMetaDes');
define('meta_keywords_field','acMetaKey');
define('link_field','acLink');
define('title_field','acTitle');
define('content_field','acContent');

#connect to database server
# $conn is used (and closed) by index.php.
$conn = mysql_connect($dbhost, $dbuser, $dbpass) or die("Could not connect to server");
mysql_select_db($dbdb, $conn) or die("Could not connect to database"); ?>
SET UP THE MAIN CRAWLER PAGE (index.php)
======================================================
<?php
//includes
// config.php defines the table/column constants and opens the database
// connection ($conn) used further down in this script.
require_once("config.php");
/**
 * Fetch the body of a URL (or any path PHP's stream wrappers can read).
 *
 * A single file_get_contents() call is used: it already returns false when
 * the target cannot be opened, so no separate fopen() probe is needed. The
 * probe requested the URL a second time and left its handle open.
 *
 * @param string $url Address to read.
 * @return string|false The content on success, false on failure.
 */
function open_url($url){
    return file_get_contents($url);
}#end open url
/**
 * Collect the distinct href targets of all anchor tags found on a page.
 *
 * @param string $url Page to download and scan.
 * @return array Map with two keys:
 *               'url'   => the $url that was scanned,
 *               'links' => list of unique href values, indexed 0..n-1 in
 *                          order of first appearance (empty when the page
 *                          could not be read or has no links).
 */
function collect_links($url){
    $combined = array();
    $combined["url"] = $url;
    $combined['links'] = array();

    $site = open_url($url);
    if($site !== false && $site !== ""){
        $anchors = array();
        // The double quotes and the slash of the closing tag must be escaped:
        // the pattern sits in a double-quoted string and uses "/" as its
        // delimiter. Group 1 is the href value, group 2 the anchor text.
        preg_match_all("/<a.*?href[=]['\"](.+?)['\"].*?>(.+?)<\/a>/i", $site, $anchors, PREG_SET_ORDER);
        foreach($anchors as $anchor){
            $combined['links'][] = $anchor[1];
        }
    }

    // array_unique() keeps the original keys, which leaves holes in the
    // numbering; re-index so callers that walk 0..count-1 see every link.
    $combined['links'] = array_values(array_unique($combined['links']));
    return $combined;
}#end collect links
/**
 * Resolve a link found on a page to a path that can be opened, and verify
 * that it is reachable.
 *
 * Links that start with "http://", "https://" or "www." are used as they
 * are; anything else is treated as relative and appended to $url with
 * exactly one slash between the two parts.
 *
 * @param string $link Link as it appeared in the page.
 * @param string $url  URL of the page the link was found on.
 * @return string The resolved path, or "" when it cannot be opened.
 */
function locate($link, $url){
    $path = $link;
    // Anchored at the start so that a relative link which merely contains
    // "http://" (e.g. in a query string) is not mistaken for an absolute one.
    if(!preg_match("#^(https?://|www\.)#i", $link)){
        $path = rtrim($url, "/")."/".ltrim($link, "/");
    }
    $handle = fopen($path, "r");
    if(!$handle){
        return "";
    }
    // The handle was only a reachability probe; release it right away.
    fclose($handle);
    return $path;
}#end get path
/**
 * Read the "description" and "keywords" meta tags of a linked page.
 *
 * @param string $link Link as it appeared in the page.
 * @param string $url  URL of the page the link was found on.
 * @return array Array whose 'description' and 'keywords' entries hold the
 *               tag contents, or "" when a tag is missing or the page is
 *               unreachable.
 */
function get_meta($link, $url){
    $link = locate($link, $url);
    // The two numeric seed entries (0 => 'description', 1 => 'keywords') are
    // retained only so the returned array keeps its historical shape.
    $meta = array('description', 'keywords');
    $meta['description'] = "";
    $meta['keywords'] = "";

    // locate() returns "" for unreachable links; do not hand an empty path
    // to get_meta_tags().
    if($link === ""){
        return $meta;
    }

    $metatags = get_meta_tags($link);
    if(is_array($metatags)){
        if(isset($metatags['description'])){
            $meta['description'] = $metatags['description'];
        }
        if(isset($metatags['keywords'])){
            $meta['keywords'] = $metatags['keywords'];
        }
    }
    return $meta;
}#end meta tags
/**
 * Extract the text of the <title> element of a linked page.
 *
 * @param string $link Link as it appeared in the page.
 * @param string $url  URL of the page the link was found on.
 * @return string The title with surrounding whitespace removed, or "" when
 *                the page is unreachable or has no title.
 */
function get_title($link, $url){
    $link = locate($link, $url);
    // locate() returns "" for unreachable links; nothing to read then.
    if($link === ""){
        return "";
    }

    $content = file_get_contents($link);
    if($content === false){
        return "";
    }

    $match = array();
    // The slash of the closing tag is escaped because "/" is the pattern
    // delimiter. The "s" modifier lets the title span several lines, and
    // [^>]* tolerates attributes on the opening tag.
    if(preg_match("/<title[^>]*>(.+?)<\/title>/is", $content, $match)){
        return trim($match[1]);
    }
    return "";
}#end page title
/**
 * Download the raw content of a linked page.
 *
 * @param string $link Link as it appeared in the page.
 * @param string $url  URL of the page the link was found on.
 * @return string|false "" when the link cannot be resolved by locate(),
 *                      otherwise whatever file_get_contents() returns for
 *                      the resolved path.
 */
function get_content($link, $url){
    $resolved = locate($link, $url);
    // Guard clause: an empty result means locate() could not open the link.
    if(empty($resolved)){
        return "";
    }
    return file_get_contents($resolved);
}#end page content
/**
 * Encode a string as a JavaScript string literal that is safe to print
 * inside a <script> element.
 *
 * @param string $text Text to embed.
 * @return string Quoted literal; '""' when the text cannot be encoded
 *                (json_encode rejects invalid UTF-8).
 */
function index_js_literal($text){
    $encoded = json_encode((string) $text);
    if($encoded === false || $encoded === 'null'){
        return '""';
    }
    return $encoded;
}

/**
 * Crawl one seed URL: collect its links, fetch each linked page and store
 * its meta description, meta keywords, title and tag-stripped content
 * (all base64-encoded) in the database. Progress is printed as HTML and
 * flushed as it goes.
 *
 * A row is identified by the pair (seed URL, link): it is inserted when
 * no such row exists and updated otherwise.
 *
 * @param string $url Seed URL to crawl.
 * @return void
 */
function index($url){
    $safeUrl = htmlspecialchars($url, ENT_QUOTES);

    $probe = fopen($url, "r");
    if(!$probe){
        print "<br><h3><i>".$safeUrl."</i> could not be reached.</h3>";
        flush();
        return;
    }
    // The handle was only a reachability probe.
    fclose($probe);

    print "<script>window.status=".index_js_literal("Indexing: ".$url).";</script>";
    print "<br><h3>Indexing <i>".$safeUrl."</i>...</h3>";
    print "<ol>";

    $collection = collect_links($url);
    $total = count($collection['links']);
    $sqlUrl = mysql_real_escape_string($url);

    // foreach rather than an index loop: the links array may have gaps in
    // its keys, and an index loop would silently skip entries.
    $position = 0;
    foreach($collection['links'] as $current){
        $position++;

        // Links containing a space are skipped, as are empty ones.
        if(empty($current) || strpos($current, " ") !== false){
            continue;
        }

        // Fetch the page body once and reuse it for both the reachability
        // check and the stored content.
        $raw = get_content($current, $url);
        $reachable = $raw ? true : false;

        $meta = get_meta($current, $url);

        // Link text comes from a remote page: escape it for HTML and JS.
        print "<li>".htmlspecialchars($current, ENT_QUOTES)."</li>";
        print "<script>window.status=".index_js_literal("Current URL: $url | Current Position: ".$position." of $total | Current Link: ".$current).";</script>";
        print "<script>document.title=".index_js_literal("Indexing ".$current).";</script>";
        flush();

        // base64 output contains no characters that need SQL escaping.
        $metades = base64_encode($meta['description']);
        $metakey = base64_encode($meta['keywords']);
        $title = base64_encode(get_title($current, $url));
        $content = base64_encode(strip_tags((string) $raw));
        #end encoding

        // The link is untrusted input; escape it before it enters a query.
        $sqlLink = mysql_real_escape_string($current);
        $where = " where ".url_field."='$sqlUrl' and ".link_field."='$sqlLink'";

        $existing = mysql_query("select ".id_field." from ".table.$where);
        // A failed lookup is treated like "no row yet", so an insert is tried.
        $isNew = (!$existing || mysql_num_rows($existing) == 0);

        if($isNew){
            $doSql = "insert into ".table;
            $doSql .= " (`".url_field."`,`".meta_description_field."`,`".meta_keywords_field."`,`".link_field."`,`".title_field."`,`".content_field."` )";
            $doSql .= " values ('$sqlUrl', '$metades', '$metakey', '$sqlLink', '$title', '$content')";
        } else {
            $doSql = "update ".table;
            $doSql .= " set ".meta_description_field."='$metades'";
            $doSql .= " ,".meta_keywords_field."='$metakey'";
            $doSql .= " , ".title_field."='$title'";
            $doSql .= " , ".content_field."='$content'";
            $doSql .= $where;
        }
        flush();

        // The write is attempted even for unreachable links (the row is then
        // stored with empty fields); OK requires both a reachable page and
        // a successful write, every other outcome is reported as FAILED.
        $saved = mysql_query($doSql);
        if($saved && $reachable){
            print " <b> OK </b>";
        } else {
            print " <b> FAILED </b>";
        }
        flush();
    }

    print "</ol>";
    print "<h4>Done</h4>";
    $doneMessage = index_js_literal("$url indexed successfully!");
    print "<script>window.status=".$doneMessage.";</script>";
    // The page title lives on document, not window.
    print "<script>document.title=".$doneMessage.";</script>";
}#end index
#start crawling
# Entry point. Without ?crawl a start link is shown. With ?crawl one URL
# from urls.txt is indexed per request (selected by ?offset, default 0) and
# the browser is then redirected to the next offset until the list is done.
if(isset($_GET['crawl'])){
    // Load the URL list, dropping blank lines and surrounding whitespace.
    $urlList = array();
    $lines = file("urls.txt");
    if($lines !== false){
        foreach($lines as $line){
            $line = trim($line);
            if($line !== ""){
                $urlList[] = $line;
            }
        }
    }

    // offset comes from the query string: force it to an integer and make
    // sure it points at an existing entry before using it as an index.
    $i = isset($_GET['offset']) ? (int) $_GET['offset'] : 0;

    if($i < 0 || $i >= count($urlList)){
        mysql_close($conn);
        print "<h3>No URL to crawl at this position in urls.txt.</h3>";
        print "<a href='index.php'>Back</a>";
    } else {
        $url = $urlList[$i];
        index($url);
        mysql_close($conn);
        if($i < count($urlList)-1){
            // urlencode() keeps the seed URL from breaking the query string
            // or the surrounding JavaScript string.
            print "<script>location='index.php?crawl&url=".urlencode($url)."&offset=".($i+1)."';</script>";
        } else {
            print "<script>location='index.php';</script>";
        }
    }
} else {
    print "<a href='index.php?crawl'>Crawl Entire List</a>";
}
?>
|
|