Extract href links from website content using a regular expression
In one of my previous posts I discussed how to grab website content using cURL. In this post I give a sample code snippet that extracts all hyperlinks from the grabbed content using a regular expression.
Using the following class, you can grab a site's content and extract all of its hyperlinks:
/**
 * Downloads a web page with cURL and extracts its hyperlinks.
 *
 * Typical use: construct with the page URL, call extractHtml() to download
 * the page into $content, then call extractHref() to fill $href_links.
 */
class ScrapWebsite
{
    /** @var string URL of the page to download. */
    public $target_url = '';

    /** @var string Raw HTML of the downloaded page ('' if nothing was fetched). */
    public $content = '';

    /** @var string Scheme + host (+ port) of $target_url, used to absolutise relative links. */
    public $base_url = '';

    /** @var array List of array($url, $title) pairs collected by extractHref(). */
    public $href_links = array();

    /**
     * @param string $url URL of the page to scrape.
     */
    public function __construct($url)
    {
        $this->target_url = $url;
        $this->getBaseUrl();
    }

    /**
     * Downloads $target_url into $content.
     *
     * On any cURL failure $content is set to an empty string, so a later
     * extractHref() call simply finds no links.
     *
     * @return void
     */
    public function extractHtml()
    {
        $options = array(
            CURLOPT_RETURNTRANSFER => true,   // return the page instead of printing it
            CURLOPT_HEADER         => false,  // don't include response headers in the body
            CURLOPT_FOLLOWLOCATION => true,   // follow redirects
            CURLOPT_ENCODING       => "",     // accept every encoding cURL supports
            CURLOPT_USERAGENT      => "spider",
            CURLOPT_AUTOREFERER    => true,   // set Referer automatically on redirect
            CURLOPT_CONNECTTIMEOUT => 120,    // seconds to wait for the connection
            CURLOPT_TIMEOUT        => 120,    // seconds to wait for the whole transfer
            CURLOPT_MAXREDIRS      => 10,     // stop after 10 redirects
            // NOTE(review): TLS certificate and host-name verification are switched off,
            // which allows man-in-the-middle tampering with the fetched page. Kept as-is
            // for backward compatibility; enable both for anything security-sensitive.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false
        );

        $ch = curl_init($this->target_url);
        if ($ch === false) {
            $this->content = '';
            return;
        }

        curl_setopt_array($ch, $options);
        $body = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; keep $content a string in every case.
        $this->content = ($body === false) ? '' : $body;
    }

    /**
     * Derives $base_url (scheme://host[:port]) from $target_url.
     *
     * The scheme of the target URL is preserved; "http" is assumed only when
     * the target URL has no scheme at all (e.g. "www.example.com/page").
     *
     * @return void
     */
    public function getBaseUrl()
    {
        $parts = parse_url($this->target_url);
        if (!is_array($parts) || empty($parts['host'])) {
            // Without a scheme parse_url() reports the host as part of the path,
            // so retry with a default scheme prepended.
            $parts = parse_url('http://' . ltrim($this->target_url, '/'));
        }
        if (!is_array($parts)) {
            $parts = array();
        }

        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
        $host   = isset($parts['host']) ? $parts['host'] : '';
        $port   = isset($parts['port']) ? ':' . $parts['port'] : '';

        $this->base_url = $scheme . '://' . $host . $port;
    }

    /**
     * Extracts <a href="..."> links from $content and appends them to $href_links.
     *
     * Each entry is array($url, $title). Links are skipped when their text
     * contains "<" (nested markup such as images) or "#", or when the href is a
     * "javascript:void" pseudo-link. Hrefs that already carry a scheme, start
     * with "//" or start with "www." are kept verbatim; everything else is
     * prefixed with $base_url. Relative paths are resolved against the site
     * root, not against the directory of $target_url.
     *
     * @return void
     */
    public function extractHref()
    {
        // Only double-quoted href attributes are recognised.
        $pattern = '/<a\b[^>]*\shref\s*=\s*"([^"]*)"[^>]*>(.*?)<\/a>/is';
        if (!preg_match_all($pattern, (string) $this->content, $anchors, PREG_SET_ORDER)) {
            return; // no links found, or the regex engine reported an error
        }

        foreach ($anchors as $anchor) {
            $href = $anchor[1];
            $text = $anchor[2];

            if (preg_match('/<|#/', $text) || stripos($href, 'javascript:void') !== false) {
                continue;
            }

            if (preg_match('/^(?:[a-z][a-z0-9+.\-]*:|\/\/|www\.)/i', $href)) {
                // Already absolute (http:, https:, mailto:, //host, www.host ...).
                $url = $href;
            } elseif ($href !== '' && $href[0] === '/') {
                $url = $this->base_url . $href;
            } else {
                // Make sure host and relative path are separated by a slash.
                $url = $this->base_url . '/' . $href;
            }

            // Fall back to the URL when the link has no visible text.
            $title = (trim($text) !== '') ? $text : $url;
            $this->href_links[] = array($url, $title);
        }
    }
}
//Junk code
Usage
// Create a scraper for the target page; the constructor also derives $scrap->base_url.
$scrap = new ScrapWebsite('http://www.scripts.morshed-alam.com');
// Download the page with cURL and store the response in $scrap->content.
$scrap->extractHtml();
// Parse the downloaded HTML; each link is appended to $scrap->href_links as array($url, $title).
$scrap->extractHref();
The code above creates an instance of the scraper class, grabs the page content, and extracts all links into the class variable $href_links for further use. Note: validate the input URL before using it to avoid exceptions.