In one of my previous posts I discussed how to grab website content using cURL. In this post I give a sample code snippet that extracts all hyperlinks from the grabbed content using a regular expression.
By using following class you can grab site content and extract all hyper links:
Usage
Click here to see the demo
By using following class you can grab site content and extract all hyper links:
/**
 * Fetches a web page over cURL and extracts every hyperlink
 * (URL + link text) into $href_links for further processing.
 *
 * Fixes over the original published snippet:
 *  - extractHref() regex was mangled by the blog's syntax highlighter
 *    (the leading "<a[^>" was eaten); restored here.
 *  - getBaseUrl() now preserves the original scheme instead of
 *    hard-coding "http://", so https targets resolve correctly.
 *  - The absolute-URL test now recognises "https:" links (the old
 *    check `/www\.|http:/` did not match the substring in "https:",
 *    so https links were wrongly prefixed with the base URL).
 */
class ScrapWebsite
{
    public $target_url = '';      // URL to fetch
    public $content = '';         // raw HTML fetched by extractHtml()
    public $base_url = '';        // scheme://host, used to absolutise relative links
    public $href_links = array(); // list of array(url, title) pairs

    /**
     * @param string $url Page to scrape. Callers should validate the
     *                    URL beforehand (as the original post notes).
     */
    public function __construct($url)
    {
        $this->target_url = $url;
        $this->getBaseUrl();
    }

    /**
     * Downloads the target page into $this->content.
     * On cURL failure the content is left as an empty string rather
     * than boolean false, so extractHref() stays safe to call.
     */
    public function extractHtml()
    {
        $options = array(
            CURLOPT_RETURNTRANSFER => true,   // return web page as a string
            CURLOPT_HEADER         => false,  // don't return headers
            CURLOPT_FOLLOWLOCATION => true,   // follow redirects
            CURLOPT_ENCODING       => "",     // handle all encodings
            CURLOPT_USERAGENT      => "spider", // who am i
            CURLOPT_AUTOREFERER    => true,   // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120,    // timeout on connect
            CURLOPT_TIMEOUT        => 120,    // timeout on response
            CURLOPT_MAXREDIRS      => 10,     // stop after 10 redirects
            CURLOPT_SSL_VERIFYPEER => false,  // NOTE: disables cert checks, as in the original
            CURLOPT_SSL_VERIFYHOST => false,
        );
        $ch = curl_init($this->target_url);
        curl_setopt_array($ch, $options);
        $result = curl_exec($ch);
        curl_close($ch);
        // curl_exec() returns false on error; normalise to ''.
        $this->content = ($result === false) ? '' : $result;
    }

    /**
     * Derives "scheme://host" from the target URL, keeping the
     * original scheme (falls back to http when none is present).
     */
    public function getBaseUrl()
    {
        $scheme = parse_url($this->target_url, PHP_URL_SCHEME);
        $host   = parse_url($this->target_url, PHP_URL_HOST);
        $this->base_url = ($scheme !== null && $scheme !== false ? $scheme : 'http')
            . '://' . $host;
    }

    /**
     * Parses $this->content for <a href="...">text</a> pairs and
     * appends array(url, title) entries to $this->href_links.
     * Skips fragment-only links, javascript: links, and links whose
     * text contains nested tags or '#' (as the original intended).
     */
    public function extractHref()
    {
        preg_match_all(
            '/<a[^>]*href\s*=\s*"([^"]*)"[^>]*>(.*?)<\/a>/is',
            $this->content,
            $matches
        );
        for ($i = 0; $i < count($matches[1]); $i++) {
            $href = trim($matches[1][$i]);
            $text = $matches[2][$i];

            // Skip empty hrefs, in-page anchors, and javascript: pseudo-links.
            if ($href === '' || $href[0] === '#') {
                continue;
            }
            if (preg_match('/^javascript:/i', $href)) {
                continue;
            }
            // Preserve the original filter: drop links whose visible
            // text contains a nested tag or a '#'.
            if (preg_match('/<|#/', $text)) {
                continue;
            }

            // Absolute if it carries a scheme, is protocol-relative,
            // or starts with "www." (original behaviour); otherwise
            // prefix with the base URL.
            if (preg_match('~^(?:[a-z][a-z0-9+.\-]*:)?//~i', $href)
                || preg_match('/^www\./i', $href)
            ) {
                $url = $href;
            } else {
                $url = $this->base_url . $href;
            }

            $title = trim($text) !== '' ? $text : $url;
            $this->href_links[] = array($url, $title);
        }
    }
}
Usage
$scrap = new ScrapWebsite('http://www.scripts.morshed-alam.com'); $scrap->extractHtml(); $scrap->extractHref();The above code creates an instance of the scrap class, grabs the content, and extracts all links into the class variable $href_links for further use. Note: validate the input URL to avoid exceptions.
Click here to see the demo
No comments:
Post a Comment