Zend - The PHP Company




Utilities

Add Code


linkGrabber  

Type: class
Added by: akaridis
Entered: 07/12/2005
Last modified: 02/12/2005
Rating: - (fewer than 3 votes)
Views: 3194
This function grabs the links from one or more URLs or local files. It works with ALL links including image links. It also automatically corrects relative links.


<?php   
       
######################################################################### 
       #  linkGrabber.php  v1.1                                                                                                        # 
       #  -----------                                                                                                                        # 
       #  Copyright  (C)  2005  Aristidis  Karidis,  aris.karidis@bcs.org                        # 
       #  ----------------------------------------------------------                        # 
       #  This  function  grabs  the  links  from  one  or  more  URLs  or  local  files.          # 
       #                                                                                                                                                # 
       ######################################################################### 
       #                                                                                                                                                # 
       #  This  program  is  free  software;  you  can  redistribute  it  and/or                        # 
       #  modify  it  under  the  terms  of  the  GNU  General  Public  License                        # 
       #  as  published  by  the  Free  Software  Foundation;  either  version  2                # 
       #  of  the  License,  or  (at  your  option)  any  later  version.                                # 
       #                                                                                                                                                # 
       #  This  program  is  distributed  in  the  hope  that  it  will  be  useful,                # 
       #  but  WITHOUT  ANY  WARRANTY;  without  even  the  implied  warranty  of                # 
       #  MERCHANTABILITY  or  FITNESS  FOR  A  PARTICULAR  PURPOSE.    See  the                        # 
       #  GNU  General  Public  License  for  more  details.                                                        # 
       #  ------------------------------------                                                                        # 
       #  http://www.gnu.org/copyleft/gpl.html                                                                        # 
       ######################################################################### 
        
        
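       /**
        * Grabs the links from one or more URLs or local files. It works with all <a ... href=...> links on a page.
        * Relative links are corrected by prefixing them with the source URL/path; image links used as
        * link labels are grabbed as well and corrected in the same way if they are relative.
        * Links that are only created by JavaScript at run time are not parsed when scanning a remote page,
        * since they don't exist until you 'do' something on the page.
        * If applied to local file(s), links written inside the JavaScript source are grabbed too.
        *
        * @param string|array $url -- a URL, a local file, or an array of URLs/files.
        * @param int $unique -- filters duplicate links if set to 1; doesn't if set to 0.
        * @return array -- one entry per source, keyed by the source URL/file; each entry is an array that maps
        *                  the rebuilt link markup '<a href=...>label</a>' to its href.
        */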
       
function linkGrabber($url, $unique = 1)
{
    // Overview: every source is read with file_get_contents() and scanned with plain
    // string functions (no HTML parser). All single and double quotes are removed from
    // the document first, so attribute values are bare tokens and link labels lose any
    // quotes they contained. Returns array(source => array(link markup => href)).

    $startTag   = '<a ';
    $hrefTag    = 'href=';
    $endTag     = '>';
    $closingTag = '</a>';

    // Characters that terminate an unquoted attribute value.
    $valueEnd = " \t\r\n";

    if (!is_array($url)) {
        $url = array($url);
    }

    if ($unique !== 0 && $unique !== 1) {
        echo 'Invalid parameter for $unique. The parameter must be either 1 or 0.';
        exit();
    }

    // In case we have several large pages. Guarded because hosts may disable the function.
    if (function_exists('set_time_limit')) {
        set_time_limit(0);
    }

    $results = array();

    foreach ($url as $source) {
        $links    = array();
        $contents = file_get_contents($source);

        if ($contents === false) {
            // Unreadable source: PHP has already raised a warning; report an empty link list.
            $contents = '';
        }

        // Strip " and ' once up front (the operation is idempotent, so once is enough).
        $contents = str_replace(array('"', "'"), '', $contents);

        // Relative targets are resolved by plain prefixing with the source itself.
        $base = rtrim($source, '/');

        while (true) {
            // Checking for 'href' only is not enough (it may appear in plain text), and
            // '<a href' is not enough either: '<a class=x href=...>' is valid too.
            $contents = stristr($contents, $startTag);
            if ($contents === false) {
                break;                                  // No further anchors: stop without recording anything.
            }

            $contents = stristr($contents, $hrefTag);
            if ($contents === false) {
                break;
            }

            $endTagPosition = strpos($contents, $endTag);
            if ($endTagPosition === false) {
                break;                                  // Truncated tag at the end of the document.
            }

            // Everything between 'href=' and '>', then cut at the first whitespace
            // so that trailing attributes are dropped.
            $href = (string) substr($contents, strlen($hrefTag), $endTagPosition - strlen($hrefTag));
            $href = (string) substr($href, 0, strcspn($href, $valueEnd));

            // Continue from the '>' that closes the opening tag.
            $contents = substr($contents, $endTagPosition);

            $closingTagPosition = stripos($contents, $closingTag);
            if ($closingTagPosition === false) {
                // Unclosed anchor: the rest of the document is its label and the scan ends here.
                $label    = (string) substr($contents, 1);
                $contents = '';
            } else {
                $label    = (string) substr($contents, 1, $closingTagPosition - 1);
                $contents = substr($contents, $closingTagPosition);
            }

            // Image used as the link label: rebuild the label with a corrected image source.
            if (stripos($label, '<img') !== false) {
                $src = stristr($label, 'src=');
                $src = ($src === false) ? '' : (string) substr($src, 4);
                // The value ends at the first whitespace or at the '>' closing the <img> tag.
                $src = (string) substr($src, 0, strcspn($src, $valueEnd . '>'));
                $src = linkGrabberMakeAbsolute($base, $src, array('http://', 'https://'));

                $label = '<img border="0" src=' . $src . '>';
            }

            $href = linkGrabberMakeAbsolute($base, $href, array('http://', 'https://', 'mailto:', 'ftp://'));

            // Keyed by the rebuilt markup, so identical anchors collapse into one entry.
            $links['<a href=' . $href . '>' . $label . '</a>'] = $href;
        }

        // array_unique() keeps the first anchor seen for every distinct href.
        $results[$source] = ($unique === 1) ? array_unique($links) : $links;
    }

    return $results;
}

/**
 * Helper for linkGrabber(): turns a relative link into an absolute one by prefixing it with $base.
 *
 * @param string $base -- source URL/path without a trailing slash.
 * @param string $link -- link target as found in the page.
 * @param array $schemes -- prefixes (e.g. 'http://') that mark a link as already absolute.
 * @return string
 */
function linkGrabberMakeAbsolute($base, $link, $schemes)
{
    if ($link === '') {
        return $link;                                   // Nothing to resolve.
    }

    if (strpos($link, '/') === 0) {
        return $base . $link;                           // Link already starts with '/'.
    }

    foreach ($schemes as $scheme) {
        if (stripos($link, $scheme) === 0) {
            return $link;                               // Already absolute.
        }
    }

    return $base . '/' . $link;                         // Relative link, so add base and '/'.
}
        
       
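/**
 * Takes the array of links returned by linkGrabber() and shows them on an HTML page.
 * If $simplePresentation = 0 it shows, per source, a table with a live link and the actual link text.
 * If $simplePresentation = 1 it shows only the link text, one link per line.
 * In both modes the total number of links is printed at the end.
 *
 * @param array $results -- array of sources, each mapping link markup to its href.
 * @param int $simplePresentation -- 0 for the table view, 1 for the plain list.
 */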
       
function showResults($results, $simplePresentation = 0)
{
    // Echoes the links in $results as HTML: one table per source (mode 0) or a plain
    // '<br>'-separated list of link targets (mode 1), followed by the total link count.

    if (!is_array($results)) {
        echo 'Invalid parameter for $results. The parameter must be an array.';
        exit();
    }

    if ($simplePresentation !== 0 && $simplePresentation !== 1) {
        echo 'Invalid parameter for $simplePresentation. The parameter must be either 1 or 0.';
        exit();
    }

    $escapeFlags = ENT_QUOTES | ENT_SUBSTITUTE;
    $total = 0;

    foreach ($results as $source => $links) {
        if ($simplePresentation === 0) {
            // A source without links produces no table at all.
            if (count($links) > 0) {
                // The source and the link targets come from scraped pages, so escape them.
                $safeSource = htmlspecialchars((string) $source, $escapeFlags, 'UTF-8');

                echo '<table align="center">';
                echo '<tr><td colspan="2" bgcolor="Gray">Links found in <a href="' . $safeSource . '">' . $safeSource . '</a></td></tr>';

                foreach ($links as $markup => $target) {
                    // NOTE: $markup is emitted raw on purpose -- it is the "live link" column.
                    // It is built from scraped content, so only display results from trusted pages.
                    echo '<tr><td align="right">' . $markup . '</td><td>' . htmlspecialchars((string) $target, $escapeFlags, 'UTF-8') . '</td></tr>';
                }

                echo '</table>';
            }
        } else {
            foreach ($links as $target) {
                echo htmlspecialchars((string) $target, $escapeFlags, 'UTF-8') . '<br>';
            }
        }

        $total = $total + count($links);
    }

    echo '<br>Total Number of Links: ' . $total;
}
?>


Usage Example


<!-- 
       Example 
       Un-comment  showResults($links,  1);  to  see  the  alternative  output. 
       Un-comment  $links  =  linkGrabber($array,  0);  to  get  duplicate  links 
--> 
<html> 
<head> 
</head> 
<body> 
       <?php   
               $array  
=  array('http://www.google.com',  'http://www.php.net''http://www.zend.com''http://www.phparch.com''http://www.bbc.co.uk'); 
               
$links  =  linkGrabber($array); 
               
showResults($links); 
               
#$links  =  linkGrabber($array,  0); 
               #showResults($links,  1); 
       
?> 
</body> 
</html>


Rate This Script





Search



This Category All Categories