您的位置:首页 > 编程语言 > PHP开发

PHP正则采集类,采集moviereleased.net站点,可以无限扩展

2012-01-10 10:35 357 查看
<?php
// Cookie jar location: a file named cookie.txt in the same directory as this script.
define('COOKIE_PATH', __DIR__ . '/cookie.txt');

/**
 * Regex-driven scraper built on cURL.
 *
 * Usage: configure a listing URL (and optionally a login URL) with setUrl(),
 * register an ordered pattern list with setRegex(), then call matchDetail().
 *
 * Pattern list layout (order matters):
 *   - 1st pattern: run against the listing page; every NAMED capture group
 *     yields a fragment that is expected to contain a link.
 *   - 2nd pattern: run against each fragment; must expose a named group "href".
 *   - remaining patterns: run against every linked page; each pattern's array
 *     key must equal the name of the capture group whose value is collected.
 */
class Collection {
    /** @var string|null URL of the listing page. */
    private $_url;
    /** @var array|null Ordered pattern list, see class comment. */
    private $_regex;
    /** @var array Links collected by the last matchLink() call. */
    private $_match_href = array();
    /** @var string File used by cURL to read and store cookies. */
    private $_cookie_file = COOKIE_PATH;
    /** @var string|null URL the login form is POSTed to. */
    private $_login_url;
    /** @var array Captured values of the last matchDetail() call, keyed by pattern key. */
    private $_detail = array();

    public function __construct() {
    }

    /**
     * Set the listing URL and, optionally, the login URL.
     *
     * @param array $url array('target_url' => ..., 'login_url' => ...); login_url is optional
     * @return false|null false when $url is not an array or has no target_url
     */
    public function setUrl($url) {
        if (!is_array($url) || empty($url['target_url'])) {
            return false;
        }
        $this->_url = $url['target_url'];
        // login_url is optional; empty() avoids an undefined-key warning when it is absent.
        if (!empty($url['login_url'])) {
            $this->_login_url = $url['login_url'];
        }
    }

    /**
     * Register the ordered pattern list (see class comment for the layout).
     *
     * @param array $regex
     * @return false|null false when $regex is not an array
     */
    public function setRegex($regex) {
        if (!is_array($regex)) {
            return false;
        }
        $this->_regex = $regex;
    }

    /**
     * POST the credentials to the login URL, storing received cookies in the cookie file.
     *
     * @param string|array $user_data POST body handed to cURL as-is
     * @return false|null false when no login URL has been configured
     */
    public function userLogin($user_data) {
        if (empty($this->_login_url)) {
            return false;
        }
        if (!file_exists($this->_cookie_file)) {
            file_put_contents($this->_cookie_file, '');
        }
        $this->getData($this->_login_url, $user_data, $this->_cookie_file);
    }

    /**
     * Fetch the listing page and collect the links found in it.
     *
     * The pattern list is read, not consumed, and the link list is rebuilt on
     * every call, so the method can safely be called more than once.
     *
     * @return array list of href values; empty when nothing could be collected
     */
    public function matchLink() {
        $this->_match_href = array();

        // Both the list pattern and the link pattern are required.
        if (!is_array($this->_regex) || count($this->_regex) < 2 || empty($this->_url)) {
            return $this->_match_href;
        }
        $patterns = array_values($this->_regex);
        $list_regex = $patterns[0];
        $link_regex = $patterns[1];

        $page = $this->getData($this->_url);
        // getData() returns false when the transfer failed.
        if (!is_string($page) || $page === '') {
            return $this->_match_href;
        }
        if (!preg_match_all($list_regex, $page, $blocks)) {
            return $this->_match_href;
        }

        foreach ($blocks as $group => $fragments) {
            // Numeric keys duplicate the named groups; only named groups are scanned.
            if (is_int($group)) {
                continue;
            }
            foreach ($fragments as $fragment) {
                // Skip fragments without a link instead of queueing a null URL.
                if (preg_match($link_regex, $fragment, $link) === 1
                    && isset($link['href']) && $link['href'] !== '') {
                    $this->_match_href[] = $link['href'];
                }
            }
        }
        return $this->_match_href;
    }

    /**
     * Collect the links, optionally log in, then scrape every linked page.
     *
     * @param string|array|false $user_data credentials for userLogin(); false skips the login
     * @return array|false captured values keyed by pattern key, one entry per
     *                     linked page (null where a pattern did not match);
     *                     false when no links were found
     */
    public function matchDetail($user_data = false) {
        $this->matchLink();
        if ($user_data) {
            $this->userLogin($user_data);
        }
        if (empty($this->_match_href)) {
            return false;
        }

        $this->_detail = array();
        // Everything after the list and link patterns is a detail pattern.
        // array_slice keeps string keys and renumbers integer keys, exactly
        // like the two array_shift calls it replaces.
        $detail_regex = array_slice($this->_regex, 2);

        foreach ($this->_match_href as $href) {
            $page = $this->getData($href, false, $this->_cookie_file);
            foreach ($detail_regex as $key => $pattern) {
                $found = is_string($page) && preg_match($pattern, $page, $match) === 1;
                // Always push one entry per page so that index i of every
                // field refers to the same page.
                $this->_detail[$key][] = ($found && isset($match[$key])) ? $match[$key] : null;
            }
        }
        return $this->_detail;
    }

    /**
     * Perform an HTTP request with cURL and return the response body.
     *
     * @param string             $url         address to request
     * @param string|array|false $data        POST body; a falsy value sends a GET
     * @param string|false       $cookie_file file to read cookies from and write them to
     * @param int                $timeout     connection timeout in seconds (only the
     *                                        connection phase is bounded)
     * @return string|false response body, or false on failure
     */
    public function getData($url, $data = false, $cookie_file = false, $timeout = 3) {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        if ($data) {
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
        }
        if ($cookie_file) {
            curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
            curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
        }
        $response = curl_exec($ch);
        curl_close($ch);
        return $response;
    }
}

// Demo run: scrape moviereleased.net and print the collected post images.
$c = new Collection();

// Order matters: the first pattern extracts the listing fragments, the second
// extracts the link from each fragment; the remaining patterns are applied to
// every linked page and their keys name the capture group that is collected.
$regex = array(
'list_h2'=>'/<h2\s*class="\s*posttitle\s*"\s*>(?<link>.*?)<\/h2>/is',
'alink'=>'/<a\s*href="(?<href>.*?)">.*?<\/a>/is',
'post_title'=>'/<h2\s*class="\s*posttitle\s*"\s*><a.*?href=".*?".*?>(?<post_title>.*?)<\/a><\/h2>/is',
'post_content'=>'/<div\s*class="postcontent"\s*><p>(?<post_content>.*?)<div\s*class="wumii-hook">/is',
'post_img'=>'/<div\s*class="postcontent"\s*><p><a\s*href="(?<large_img>.*?)"\s*><img.*?src="(?<post_img>.*?)".*?><\/a>/is',
'review'=>'/<li.*?class="comment\s*byuser\s*comment-author-admin.*?".*?>(?<review>.*?)<\/li>/is',
);
$url = array('target_url'=>'http://moviereleased.net/','login_url'=>'http://moviereleased.net/wp-login.php');

$c->setRegex($regex);
$c->setUrl($url);

// Credentials are sent as the POST body of the login request.
$user_data = 'log=testuser&pwd=testuser';
$data = $c->matchDetail($user_data);

// matchDetail() returns false when no links were collected; indexing false
// would raise a warning, so only print when a result array is available.
if (is_array($data) && isset($data['post_img'])) {
    print_r($data['post_img']);
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: