Friday, June 13, 2008

Extract href links from a website content using regular expression

In one of my previous posts, I discussed how to grab website content using cURL. In this post, I give a sample code snippet that extracts all hyperlinks from the grabbed content using a regular expression.

By using the following class, you can grab a site's content and extract all of its hyperlinks:
/**
 * Downloads a web page with cURL and collects the hyperlinks found in it.
 *
 * Typical flow: construct with the page URL, call extractHtml() to fetch the
 * markup, then extractHref() to fill $href_links.
 */
class ScrapWebsite
{
    /** @var string URL of the page to fetch. */
    public $target_url = '';

    /** @var string Raw page body; empty until extractHtml() succeeds. */
    public $content = '';

    /** @var string Scheme, host and optional port of $target_url, without a trailing slash. */
    public $base_url = '';

    /** @var array List of array(url, title) pairs filled by extractHref(). */
    public $href_links = array();

    /**
     * @param string $url Page to scrape; the base URL is derived from it immediately.
     */
    public function __construct($url)
    {
        $this->target_url = $url;
        $this->getBaseUrl();
    }

    /**
     * Fetches $target_url and stores the response body in $content.
     *
     * On any cURL failure $content is reset to an empty string, so a later
     * extractHref() call simply finds no links instead of receiving `false`.
     *
     * @return bool True when a response body was received, false otherwise.
     */
    public function extractHtml()
    {
        $options = array(
            CURLOPT_RETURNTRANSFER => true,     // return the body instead of printing it
            CURLOPT_HEADER         => false,    // don't include response headers in the body
            CURLOPT_FOLLOWLOCATION => true,     // follow redirects
            CURLOPT_ENCODING       => "",       // accept every encoding cURL supports
            CURLOPT_USERAGENT      => "spider", // user agent sent to the server
            CURLOPT_AUTOREFERER    => true,     // set the referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120,      // seconds to wait while connecting
            CURLOPT_TIMEOUT        => 120,      // seconds allowed for the whole transfer
            CURLOPT_MAXREDIRS      => 10,       // stop after 10 redirects
            // NOTE(review): certificate and host verification are switched off, so
            // HTTPS responses are not authenticated. Kept as-is to preserve the
            // original behaviour; enable both for anything security-sensitive.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false
        );

        $ch = curl_init($this->target_url);
        if ($ch === false) {
            $this->content = '';
            return false;
        }

        curl_setopt_array($ch, $options);
        $response = curl_exec($ch);
        curl_close($ch);

        // curl_exec() yields false on failure; never store that as page content.
        if (!is_string($response)) {
            $this->content = '';
            return false;
        }

        $this->content = $response;
        return true;
    }

    /**
     * Derives $base_url (scheme://host[:port]) from $target_url.
     *
     * The scheme falls back to "http" when the URL does not carry one.
     */
    public function getBaseUrl()
    {
        $parts = parse_url($this->target_url);
        if (!is_array($parts)) {
            $parts = array();
        }

        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
        $host   = isset($parts['host']) ? $parts['host'] : '';
        $port   = isset($parts['port']) ? ':' . $parts['port'] : '';

        $this->base_url = $scheme . '://' . $host . $port;
    }

    /**
     * Scans $content for <a href="..."> elements and appends array(url, title)
     * pairs to $href_links.
     *
     * Skipped: anchors whose text contains markup ("<") or "#", javascript:
     * pseudo-links, and empty or fragment-only hrefs. Hrefs that carry a scheme,
     * start with "//" or start with "www." are stored unchanged; everything else
     * is joined to $base_url at the site root (path-relative links are NOT
     * resolved against the directory of the fetched page).
     */
    public function extractHref()
    {
        // Only real anchor tags are matched; the href may be quoted with " or '.
        // "i" tolerates upper-case markup, "s" lets the link text span lines.
        $pattern = '/<a\s+(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|\'([^\']*)\')[^>]*>(.*?)<\/a>/is';

        if (!is_string($this->content)
            || !preg_match_all($pattern, $this->content, $anchors, PREG_SET_ORDER)
        ) {
            return;
        }

        foreach ($anchors as $anchor) {
            // Exactly one of the two quoted-value groups participates in a match.
            $href = trim($anchor[1] !== '' ? $anchor[1] : $anchor[2]);
            $text = $anchor[3];

            if ($href === '' || $href[0] === '#') {
                continue; // nothing to link to, or an in-page fragment
            }
            if (stripos($href, 'javascript:') === 0) {
                continue; // script pseudo-link
            }
            if (strpbrk($text, '<#') !== false) {
                continue; // link text holding nested markup or "#"
            }

            if (preg_match('~^(?:[a-z][a-z0-9+.\-]*:|//|www\.)~i', $href)) {
                $url = $href;
            } else {
                $url = $this->base_url . '/' . ltrim($href, '/');
            }

            // Compare against '' so a literal "0" link text is kept as the title.
            $title = trim($text) !== '' ? $text : $url;

            $this->href_links[] = array($url, $title);
        }
    }
}
//Junk code  generated by syntax highlighter :)

Usage
// Create the scraper for the target page; the constructor also derives the base URL.
$scrap = new ScrapWebsite('http://www.scripts.morshed-alam.com');
// Download the page body into $scrap->content (performs a network request).
$scrap->extractHtml();
// Parse the downloaded markup and collect the links into $scrap->href_links.
$scrap->extractHref();
The above code creates an instance of the ScrapWebsite class, grabs the page content, and extracts all links into the class variable $href_links for further use. Note: validate the input URL before passing it in to avoid errors.

Click here to see the demo

No comments:

Post a Comment