您的位置:首页 > 运维架构 > 网站架构

如何扒网站

2016-07-09 13:14 537 查看
首先定义一个cg_http的类

使用 PHP 的 cURL 扩展抓取网页内容,并把抓取到的内容写入指定的文件夹

/**
 * Small wrapper around a cURL handle that downloads one URL and returns the body.
 *
 * Usage: construct with a URL (or call set_url() later), then call send_request().
 * send_request() closes the handle, so one instance serves one request.
 */
class cg_http
{
    // Default User-Agent header sent with every request.
    private $agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36';
    // Form fields to POST; no setter is visible in this listing, so it stays empty here.
    private $post_fields = array();
    private $post_files = array();
    // Extra request headers passed to CURLOPT_HTTPHEADER when non-empty.
    private $headers = array();
    // Either a ready-made cookie string or a list of "name=value" entries.
    private $cookies = array();
    private $curl;
    private $url = '';
    // Connect timeout in seconds. The property was read in set_url() without
    // ever being declared; it is now declared with an explicit default.
    private $timeout = 10;
    // True when $post_fields carries file uploads and must be sent as
    // multipart/form-data (i.e. passed to cURL as an array). Previously undeclared.
    private $has_file = false;
    // Message from curl_error() for the most recent request, '' when none.
    private $last_error = '';

    /**
     * @param string $url Optional URL to request; may also be set later via set_url().
     */
    public function __construct($url = '')
    {
        $this->curl = curl_init();
        if ($url != '')
        {
            $this->set_url($url);
        }
    }

    /**
     * Sets the target URL and the transfer options used for every request.
     *
     * @param string $url
     */
    public function set_url($url)
    {
        $this->url = $url;
        curl_setopt($this->curl, CURLOPT_URL, $this->url);
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->curl, CURLOPT_FOLLOWLOCATION, true);
        // NOTE(review): peer certificate verification is disabled, which permits
        // man-in-the-middle tampering. Left as-is so behavior is unchanged.
        curl_setopt($this->curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($this->curl, CURLOPT_AUTOREFERER, true);
        curl_setopt($this->curl, CURLOPT_CONNECTTIMEOUT, $this->timeout);
        curl_setopt($this->curl, CURLOPT_HEADER, false);
        $this->set_agent($this->agent);
    }

    /**
     * Overrides the User-Agent header.
     *
     * @param string $agent
     */
    public function set_agent($agent)
    {
        curl_setopt($this->curl, CURLOPT_USERAGENT, $agent);
    }

    /**
     * Changes the connect timeout.
     *
     * @param int $seconds
     */
    public function set_timeout($seconds)
    {
        $this->timeout = (int) $seconds;
        curl_setopt($this->curl, CURLOPT_CONNECTTIMEOUT, $this->timeout);
    }

    /**
     * Returns the cURL error message of the last send_request() call.
     *
     * @return string '' when the last request reported no error.
     */
    public function get_last_error()
    {
        return $this->last_error;
    }

    /**
     * Executes the request and returns the response body.
     *
     * Terminates the script with die() when no URL has been set.
     *
     * @param mixed $data        Optional raw request body; when non-empty it is
     *                           passed to CURLOPT_POSTFIELDS and overrides $post_fields.
     * @param bool  $transcoding When true, a body that is not valid UTF-8 is
     *                           converted from GBK to UTF-8.
     * @return string|false Response body, or false when curl_exec() fails
     *                      (see get_last_error()).
     */
    public function send_request($data = '', $transcoding = false)
    {
        if ($this->url == '')
        {
            die('url is empty');
        }

        if (is_string($this->cookies))
        {
            curl_setopt($this->curl, CURLOPT_COOKIE, $this->cookies);
        }
        elseif (is_array($this->cookies) && count($this->cookies) > 0)
        {
            curl_setopt($this->curl, CURLOPT_COOKIE, implode(';', $this->cookies));
        }

        if (count($this->post_fields) > 0)
        {
            curl_setopt($this->curl, CURLOPT_POST, true);
            if ($this->has_file)
            {
                // An array makes cURL send multipart/form-data, required for uploads.
                curl_setopt($this->curl, CURLOPT_POSTFIELDS, $this->post_fields);
            }
            else
            {
                curl_setopt($this->curl, CURLOPT_POSTFIELDS, http_build_query($this->post_fields));
            }
        }
        if (count($this->headers) > 0)
        {
            curl_setopt($this->curl, CURLOPT_HTTPHEADER, $this->headers);
        }
        if (!empty($data))
        {
            curl_setopt($this->curl, CURLOPT_POSTFIELDS, $data);
        }
        // An empty string enables every compression encoding cURL supports
        // (gzip etc.), so compressed pages are decoded transparently.
        curl_setopt($this->curl, CURLOPT_ENCODING, '');

        $result = curl_exec($this->curl);

        // Read the error before the handle is closed and keep it for the caller
        // instead of dropping it.
        $this->last_error = (string) curl_error($this->curl);
        curl_close($this->curl);

        // Only transcode a real body; curl_exec() returns false on failure.
        if ($transcoding && is_string($result) && $result !== '')
        {
            // Test for valid UTF-8 first. Detecting with GB2312/GBK ahead of UTF-8
            // can misclassify UTF-8 pages and corrupt them during conversion.
            if (!mb_check_encoding($result, 'UTF-8'))
            {
                $converted = iconv('gbk', 'utf-8//IGNORE', $result);
                if ($converted !== false)
                {
                    $result = $converted;
                }
            }
        }

        return $result;
    }
}
// $web_url is the address of the page to download.
$web_url = "http://www.baidu.com?curl.php";
$webpage = new cg_http($web_url);
$content = $webpage->send_request();

// basename() of a URL can keep characters such as "?" that are not legal in
// Windows file names, so anything outside a conservative set becomes "_".
$filename = preg_replace('/[^A-Za-z0-9._-]+/', '_', basename($web_url));

// Store the downloaded page inside the aa folder, creating it if needed.
$save_dir = "D://aa";
if (!is_dir($save_dir))
{
    mkdir($save_dir, 0777, true);
}

// send_request() returns false when the transfer failed; write nothing then.
// file_put_contents() takes (path, data): the target is folder + "/" + file name.
if ($content !== false)
{
    file_put_contents($save_dir . '/' . $filename, $content);
}


如果有多个 URL 地址,也可以批量操作:

// Put the URLs to process into $arr_url before this loop, for example
// array('http://a.example/', 'http://b.example/'). It defaults to an empty
// list so the loop is a no-op instead of a warning when nothing was supplied.
$arr_url = isset($arr_url) ? $arr_url : array();

$save_dir = "D://aa";
if (!is_dir($save_dir))
{
    mkdir($save_dir, 0777, true);
}

foreach ($arr_url as $key => $urls)
{
    $webpage = new cg_http($urls);
    $content = $webpage->send_request();
    if ($content === false)
    {
        // Transfer failed; skip this URL.
        continue;
    }
    // Each page is saved under its array key.
    $filename = "{$key}";
    // The "/" keeps the file inside the folder; without it the files were
    // created next to the folder as D://aa0, D://aa1, ...
    file_put_contents($save_dir . '/' . $filename, $content);
}


读取已写入 D://aa 目录的文件,并用正则表达式提取需要的内容

$dir = "D://aa";
// Placeholder pattern: replace it with the expression you want to match.
// It must be a delimited PCRE pattern; a bare word such as "PCRE" is rejected
// by preg_match_all(). This example captures the page title.
$pattern = '/<title>(.*?)<\/title>/i';
// Matches per file name, so results from earlier files are not overwritten.
$all_matches = array();

$handler = opendir($dir);
if ($handler !== false)
{
    // Strict comparison: readdir() returns false at the end of the directory,
    // and a loose "!= false" would also stop on an entry named "0".
    while (($file = readdir($handler)) !== false)
    {
        if ($file == ".." || $file == ".")
        {
            continue;
        }
        $path = $dir . '/' . $file;
        if (!is_file($path))
        {
            continue;
        }
        $content = file_get_contents($path);
        if ($content === false)
        {
            continue;
        }
        // Strip newlines, tabs and carriage returns so a pattern can match
        // across what were separate source lines.
        $content = str_replace(array("\n", "\t", "\r"), '', $content);
        preg_match_all($pattern, $content, $matches);
        $all_matches[$file] = $matches;
    }
    closedir($handler);
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息