This function finds all the href links on a page, together with their anchor text.
It can also find href links whose text part contains images or other HTML tags. I have seen a lot of regular expressions in books and on the internet, but they miss href links that have images or HTML tags in them.
It returns an array containing the links and their anchor texts.
This function can also find href links whose text part contains images or other HTML tags. I have seen a lot of regular expressions in books and on the internet, but they miss href links that have images or HTML tags in them.
It returns an array containing the links and their anchor texts.
<?php
/**
 * Collects every href link in an HTML page together with its anchor text.
 *
 * The page is pre-processed before matching: all <img> tags are removed and
 * the body of every <a>...</a> element is replaced by its tag-free text.
 * This is needed because the final href pattern only accepts anchor text
 * that contains no "<", so links wrapping images or inline markup would
 * otherwise be skipped.
 *
 * @param string $content Contents of an HTML page as a single string.
 * @return array List of array('url' => ..., 'anchor_text' => ...) entries in
 *               the order they were matched; empty when nothing matches.
 */
function getURLsFromPage($content)
{
    // Remove every image tag from the page.
    foreach (getImageTags($content) as $image) {
        $content = str_replace($image, "", $content);
    }

    // Flatten markup inside anchors to plain text.
    foreach (getAnchorText($content) as $anchor) {
        $plain = strip_tags($anchor);
        // Replacing a string with itself is a no-op, so skip it.
        if ($plain !== $anchor) {
            $content = str_replace($anchor, $plain, $content);
        }
    }

    // Get all URLs in the page irrespective of their domains.
    // Group 1 is the href value, group 2 the (tag-free) anchor text.
    $matches = array();
    $URL_pattern = "/\s+href\s*=\s*[\"\']?\s*([^\s\"\']+)[\"\'\s]?\s*[^>]*>([^<]*)<\/a>/ims";
    if (!preg_match_all($URL_pattern, $content, $matches, PREG_PATTERN_ORDER)) {
        // No match (0) or a PCRE error (false): nothing to report.
        return array();
    }

    // With PREG_PATTERN_ORDER both groups share the same indexes.
    $urls = array();
    foreach ($matches[1] as $match_key => $match_value) {
        $urls[] = array(
            'url' => $match_value,
            'anchor_text' => $matches[2][$match_key],
        );
    }

    return $urls;
}
/**
 * Returns every <img ...> tag found in the given HTML, as raw tag strings.
 *
 * Matching is case-insensitive and a tag may span several lines. Each tag is
 * taken to end at the first ">" after its opening "<".
 *
 * @param string $content HTML to scan.
 * @return array List of the complete matched tags (e.g. '<img src="a.png">'),
 *               in document order; empty when there are none.
 */
function getImageTags($content)
{
    // [^>]* (unlike ".") also crosses newlines, so multi-line tags are found.
    // The lookahead requires whitespace, "/" or ">" right after the name, so
    // other tag names that merely start with "img" are not matched.
    $pattern = "/<\s*img(?=[\s\/>])[^>]*>/i";
    $matches = array();
    if (!preg_match_all($pattern, $content, $matches, PREG_PATTERN_ORDER)) {
        // No match (0) or a PCRE error (false).
        return array();
    }

    return $matches[0];
}
/**
 * Returns the inner HTML of every <a>...</a> element in the given HTML.
 *
 * Matching is case-insensitive and spans newlines. The returned strings are
 * the raw content between the opening and closing tag, so they may still
 * contain nested markup.
 *
 * @param string $content HTML to scan.
 * @return array List of anchor bodies in document order; empty when there
 *               are none.
 */
function getAnchorText($content)
{
    // The opening tag must be exactly "<a>" or "<a" followed by whitespace
    // and attributes. A bare "<a(.*?)>" would also accept <abbr>, <article>,
    // <address>, ... and capture everything up to the next "</a>".
    $url_pattern = "/<a(?:\s[^>]*)?>(.*?)<\/a>/is";
    $matches = array();
    if (!preg_match_all($url_pattern, $content, $matches)) {
        // No match (0) or a PCRE error (false).
        return array();
    }

    return $matches[1];
}
?>
Comments
Post a Comment