Utilities
|
|
|
|
<?php
#########################################################################
# linkGrabber.php v1.1 #
# ----------- #
# Copyright (C) 2005 Aristidis Karidis, aris.karidis@bcs.org #
# ---------------------------------------------------------- #
# This function grabs the links from one or more URLs or local files. #
# #
#########################################################################
# #
# This program is free software; you can redistribute it and/or #
# modify it under the terms of the GNU General Public License #
# as published by the Free Software Foundation; either version 2 #
# of the License, or (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# ------------------------------------ #
# http://www.gnu.org/copyleft/gpl.html #
#########################################################################
/**
 * Grabs the links from one or more URLs or local files.
 *
 * Each source is read with file_get_contents() and scanned for '<a ' tags that carry an
 * 'href=' attribute. All single and double quotes are stripped from the markup before
 * scanning, so an attribute value ends at the first space or '>'.
 * Links that do not start with a URI scheme (http:, https:, mailto:, ftp:, ...) are treated
 * as relative and get the source they were found in prepended.
 * If the link text contains an '<img' tag, the label is replaced by a bare
 * '<img border="0" src=...>' whose src is made absolute the same way.
 *
 * @param string|array $url    -- a URL, local file or array of URLs/files.
 * @param int          $unique -- 1 drops entries whose target repeats an earlier target; 0 keeps them.
 *                                Any other value prints an error message and terminates the script.
 * @return array source => array('<a href=TARGET>LABEL</a>' => TARGET).
 *               A source that cannot be read, or that holds no links, maps to an empty array.
 */
function linkGrabber($url, $unique = 1)
{
    $startTag   = '<a ';
    $hrefTag    = 'href=';
    $endTag     = '>';
    $closingTag = '</a>';
    $results    = array(); # Defined up front so an empty $url list returns array(), not an undefined variable

    if (!is_array($url))
    {
        $url = array($url);
    }
    if ($unique !== 0 && $unique !== 1)
    {
        printf('Invalid parameter for $unique. The parameter must be either 1 or 0.');
        exit();
    }
    set_time_limit(0); # In case we have several large pages

    foreach ($url as $value)
    {
        $links = array();
        $base  = rtrim($value, '/'); # Avoid '//' when the source already ends with a slash

        $contents = file_get_contents($value);
        if (!is_string($contents))
        {
            $contents = ''; # Unreadable source: report it with an empty link list
        }
        # Strip " and ' once; every later step works on a suffix of this string
        $contents = str_replace(array('"', "'"), '', $contents);

        while ($contents !== '')
        {
            ####################################################################################################
            # Find the next '<a ' and then the next 'href=' after it.                                          #
            # Checking for 'href' only is not enough: the string 'href' may appear without being a link.       #
            # Checking for '<a href' is not enough as '<a class="className" href="www.example.com">' is valid. #
            # Stop as soon as any piece is missing, so no empty link is recorded after the last anchor.        #
            ####################################################################################################
            $contents = stristr($contents, $startTag);
            if ($contents === false)
            {
                break;
            }
            $contents = stristr($contents, $hrefTag);
            if ($contents === false)
            {
                break;
            }
            $endTagPosition = stripos($contents, $endTag);
            if ($endTagPosition === false)
            {
                break;
            }

            # Everything between 'href=' and '>' up to the first space is the target
            $href = (string) substr($contents, strlen($hrefTag), $endTagPosition - strlen($hrefTag));
            $href = (string) substr($href, 0, strcspn($href, ' '));

            # Everything between '>' and '</a>' is the label
            $contents           = (string) substr($contents, $endTagPosition + 1);
            $closingTagPosition = stripos($contents, $closingTag);
            if ($closingTagPosition === false)
            {
                $label    = $contents; # Unclosed anchor: the rest of the page is its label
                $contents = '';
            }
            else
            {
                $label    = (string) substr($contents, 0, $closingTagPosition);
                $contents = (string) substr($contents, $closingTagPosition);
            }

            #######################################
            # Fix relative links for image labels #
            #######################################
            if (stripos($label, '<img') !== false)
            {
                $src = stristr($label, 'src=');
                $src = ($src === false) ? '' : (string) substr($src, 4); # Drop 'src='
                # The value ends at the first space or '>', whichever comes first
                $src   = (string) substr($src, 0, strcspn($src, ' >'));
                $label = '<img border="0" src='.linkGrabberMakeAbsolute($base, $src).'>';
            }

            $href = linkGrabberMakeAbsolute($base, $href);
            $links['<a href='.$href.'>'.$label.'</a>'] = $href;
        }

        # Keyed by the source itself, so associative or sparse $url arrays work too
        $results[$value] = ($unique === 1) ? array_unique($links) : $links;
    }
    return $results;
}

/**
 * Helper for linkGrabber(): prefixes $base to a link that has no URI scheme.
 *
 * @param string $base -- source the link was found in, without trailing slash.
 * @param string $link -- link target with quotes already stripped.
 * @return string the link unchanged if it is empty or starts with a scheme, otherwise $base joined to it with one '/'.
 */
function linkGrabberMakeAbsolute($base, $link)
{
    if ($link === '')
    {
        return '';
    }
    if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link) === 1)
    {
        return $link; # Already absolute (http:, https:, mailto:, ftp:, javascript:, ...)
    }
    if ($link[0] === '/')
    {
        return $base.$link;
    }
    return $base.'/'.$link;
}
/**
 * Gets the array of links produced by linkGrabber() and echoes them as html.
 * If $simplePresentation = 0 it echoes, per source, a table with a live link and the link target text.
 * If $simplePresentation = 1 it echoes only the link target text, one per line.
 * In both modes it ends with the total number of links echoed.
 *
 * The source name and the link target are plain text taken from fetched pages, so they are
 * html-escaped before output. The live link is markup by design and is echoed as it is.
 * An invalid argument prints an error message and terminates the script.
 *
 * @param array $results            -- source => array(live link markup => link target).
 * @param int   $simplePresentation -- 0 for the table view, 1 for the plain list.
 */
function showResults($results, $simplePresentation = 0)
{
    if (!is_array($results))
    {
        printf('Invalid parameter for $results. The parameter must be an array.');
        exit();
    }
    if ($simplePresentation !== 0 && $simplePresentation !== 1)
    {
        printf('Invalid parameter for $simplePresentation. The parameter must be either 1 or 0.');
        exit();
    }

    $escapeFlags = ENT_QUOTES | ENT_SUBSTITUTE; # ENT_SUBSTITUTE keeps text with invalid UTF-8 from collapsing to ''
    $total       = 0;

    foreach ($results as $source => $links)
    {
        # Nothing is echoed for a source without links; anything that is not a link list is skipped
        if (!is_array($links) || count($links) === 0)
        {
            continue;
        }
        $total += count($links);

        if ($simplePresentation === 1)
        {
            foreach ($links as $target)
            {
                echo htmlspecialchars((string) $target, $escapeFlags, 'UTF-8').'<br>';
            }
            continue;
        }

        $safeSource = htmlspecialchars((string) $source, $escapeFlags, 'UTF-8');
        echo '<table align="center">';
        echo '<tr><td colspan="2" bgcolor="Gray">Links found in <a href="'.$safeSource.'">'.$safeSource.'</a></td></tr>';
        foreach ($links as $liveLink => $target)
        {
            # NOTE(review): $liveLink is third-party markup echoed unescaped so that it renders as a link
            echo '<tr><td align="right">'.$liveLink.'</td><td>'.htmlspecialchars((string) $target, $escapeFlags, 'UTF-8').'</td></tr>';
        }
        echo '</table>';
    }
    echo '<br>Total Number of Links: '.$total;
} ?>
|
|
|
Usage Example
|
<!--
Example
Un-comment showResults($links, 1); to see the alternative output.
Un-comment $links = linkGrabber($array, 0); to get duplicate links
-->
<html>
<head>
</head>
<body>
<?php
# Pages to scan for links
$array = array(
    'http://www.google.com',
    'http://www.php.net',
    'http://www.zend.com',
    'http://www.phparch.com',
    'http://www.bbc.co.uk',
);
# Default run: duplicate links filtered, table presentation
$links = linkGrabber($array);
showResults($links);
# Alternative run: keep duplicate links, plain presentation
#$links = linkGrabber($array, 0);
#showResults($links, 1);
?>
</body>
</html>
|
|
|
Rate This Script
|
|
|
|