Extract href links from website content using a regular expression

In one of my previous posts I discussed how to grab website content using cURL. In this post I give a sample code snippet that extracts all hyperlinks from the grabbed content using a regular expression.

By using the following class you can grab site content and extract all hyperlinks:

/**
 * Fetches a web page over cURL and extracts all hyperlinks
 * (href URL + anchor text) from its HTML.
 *
 * Usage:
 *   $scrap = new ScrapWebsite('http://example.com');
 *   $scrap->extractHtml();   // fills $content
 *   $scrap->extractHref();   // fills $href_links with [url, title] pairs
 */
class ScrapWebsite
{
    public $target_url = '';       // full URL to fetch
    public $content = '';          // raw HTML fetched by extractHtml()
    public $base_url = '';         // scheme://host, used to resolve relative links
    public $href_links = array();  // list of [absolute_url, link_title] pairs

    /**
     * @param string $url Full URL of the page to scrape.
     */
    public function __construct($url)
    {
        $this->target_url = $url;
        $this->getBaseUrl();
    }

    /**
     * Downloads the target URL into $this->content via cURL.
     * On failure $this->content is set to '' instead of boolean false,
     * so later string functions (preg_match_all) receive a string.
     */
    public function extractHtml()
    {
        $options = array(
            CURLOPT_RETURNTRANSFER => true,     // return web page as a string
            CURLOPT_HEADER         => false,    // don't return headers
            CURLOPT_FOLLOWLOCATION => true,     // follow redirects
            CURLOPT_ENCODING       => "",       // handle all encodings
            CURLOPT_USERAGENT      => "spider", // who am i
            CURLOPT_AUTOREFERER    => true,     // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120,      // timeout on connect
            CURLOPT_TIMEOUT        => 120,      // timeout on response
            CURLOPT_MAXREDIRS      => 10,       // stop after 10 redirects
            CURLOPT_SSL_VERIFYPEER => false,    // NOTE: disables TLS verification
            CURLOPT_SSL_VERIFYHOST => false,
        );

        $ch = curl_init($this->target_url);
        curl_setopt_array($ch, $options);
        $html = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; normalise to empty string.
        $this->content = ($html === false) ? '' : $html;
    }

    /**
     * Derives scheme://host from the target URL, preserving the original
     * scheme (https stays https); defaults to http when none is present.
     */
    public function getBaseUrl()
    {
        $parts  = parse_url($this->target_url);
        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
        $host   = isset($parts['host']) ? $parts['host'] : '';
        $this->base_url = $scheme . '://' . $host;
    }

    /**
     * Parses $this->content for <a href="...">text</a> pairs and appends
     * [absolute_url, title] entries to $this->href_links.
     *
     * Skips empty hrefs, in-page anchors (#...), javascript: pseudo-links,
     * and anchors whose text contains nested markup. When the anchor text
     * is empty, the URL itself is used as the title.
     */
    public function extractHref()
    {
        // The original blog regex was corrupted by the syntax highlighter;
        // this is the intended pattern: <a ... href="URL" ...>TEXT</a>
        preg_match_all('/<a\s[^>]*href\s*=\s*"([^"]*)"[^>]*>(.*?)<\/a>/is',
            $this->content, $matches);

        $total = count($matches[1]);
        for ($i = 0; $i < $total; $i++) {
            $href  = trim($matches[1][$i]);
            $title = trim($matches[2][$i]);

            // Skip non-navigable or markup-bearing links.
            if ($href === ''
                || $href[0] === '#'
                || stripos($href, 'javascript:') === 0
                || strpos($title, '<') !== false
            ) {
                continue;
            }

            // Absolute URLs (scheme, protocol-relative, or bare www.) pass
            // through; everything else is resolved against the base URL.
            if (preg_match('#^(?:https?:)?//|^www\.#i', $href)) {
                $url = $href;
            } else {
                // Join with exactly one slash so "page" and "/page" both
                // resolve to scheme://host/page (the original concatenation
                // produced "http://hostpage" for slash-less hrefs).
                $url = $this->base_url . '/' . ltrim($href, '/');
            }

            $this->href_links[] = array($url, ($title !== '') ? $title : $url);
        }
    }
}
(The stray "//Junk code" comment that previously appeared here was an artifact of the blog's syntax highlighter and has been removed.)

Usage

$scrap = new ScrapWebsite('http://www.scripts.morshed-alam.com');
$scrap->extractHtml();
$scrap->extractHref();

The above code creates an instance of the scrape class, grabs the content, and extracts all links into the class variable $href_links for further use. Note: validate the input URL to avoid exceptions.

Click here to see the demo

Leave a Comment