如何扒网站
2016-07-09 13:14
537 查看
首先定义一个cg_http的类
使用curl()方法抓取网页内容并写入指定的文件夹
如果是多个url地址也可以批量操作
读取D://aa写入的文件
使用curl()方法抓取网页内容并写入指定的文件夹
/**
 * Thin wrapper around a single cURL handle used to download one web page.
 *
 * The handle is created in the constructor and closed at the end of
 * send_request(), so each instance can perform exactly one request.
 */
class cg_http
{
    /** User-Agent header sent with the request. */
    private $agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36';

    /** POST fields; when non-empty the request is sent as a POST. */
    private $post_fields = array();

    private $post_files = array();

    /** Extra HTTP header lines passed to CURLOPT_HTTPHEADER. */
    private $headers = array();

    /** Cookies, either a ready-made cookie string or a list of "name=value" pairs. */
    private $cookies = array();

    /** The cURL handle. */
    private $curl;

    /** Target URL; an empty string means "not set yet". */
    private $url = '';

    /**
     * Connection timeout in seconds.
     * Previously read in set_url() without ever being declared.
     */
    private $timeout = 30;

    /**
     * Whether $post_fields contains file uploads (sent as multipart when true).
     * Previously read in send_request() without ever being declared.
     */
    private $has_file = false;

    /**
     * @param string $url Optional target URL; when given it is applied immediately.
     */
    public function __construct($url = '')
    {
        $this->curl = curl_init();
        if ($url != '') {
            $this->set_url($url);
        }
    }

    /**
     * Set the target URL and apply the default transfer options.
     *
     * @param string $url Address of the page to fetch.
     */
    public function set_url($url)
    {
        $this->url = $url;
        curl_setopt($this->curl, CURLOPT_URL, $this->url);
        // Return the body from curl_exec() instead of printing it.
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->curl, CURLOPT_FOLLOWLOCATION, true);
        // NOTE(review): peer verification is disabled, so HTTPS certificates are not checked.
        curl_setopt($this->curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($this->curl, CURLOPT_AUTOREFERER, true);
        curl_setopt($this->curl, CURLOPT_CONNECTTIMEOUT, $this->timeout);
        curl_setopt($this->curl, CURLOPT_HEADER, false);
        $this->set_agent($this->agent);
    }

    /**
     * Set the User-Agent header for the request.
     *
     * @param string $agent User-Agent string.
     */
    public function set_agent($agent)
    {
        curl_setopt($this->curl, CURLOPT_USERAGENT, $agent);
    }

    /**
     * Execute the request and return the response body.
     *
     * Terminates the script with die() when no URL has been set. The cURL
     * handle is closed before returning, so this can only be called once
     * per instance.
     *
     * @param mixed $data        Optional raw request body; when non-empty it is
     *                           set as CURLOPT_POSTFIELDS (overriding $post_fields).
     * @param bool  $transcoding When true, a body not detected as UTF-8 is
     *                           converted from GBK to UTF-8.
     *
     * @return string|false The response body, or false when the transfer failed.
     */
    public function send_request($data = '', $transcoding = false)
    {
        if ($this->url == '') {
            die('url is empty');
        }

        if (is_string($this->cookies)) {
            curl_setopt($this->curl, CURLOPT_COOKIE, $this->cookies);
        } elseif (is_array($this->cookies) && count($this->cookies) > 0) {
            curl_setopt($this->curl, CURLOPT_COOKIE, implode(';', $this->cookies));
        }

        if (count($this->post_fields) > 0) {
            curl_setopt($this->curl, CURLOPT_POST, true);
            if ($this->has_file) {
                // Passing the array makes cURL send multipart/form-data.
                curl_setopt($this->curl, CURLOPT_POSTFIELDS, $this->post_fields);
            } else {
                curl_setopt($this->curl, CURLOPT_POSTFIELDS, http_build_query($this->post_fields));
            }
        }

        if (count($this->headers) > 0) {
            curl_setopt($this->curl, CURLOPT_HTTPHEADER, $this->headers);
        }

        if (!empty($data)) {
            curl_setopt($this->curl, CURLOPT_POSTFIELDS, $data);
        }

        // An empty string enables every encoding cURL supports, so pages
        // compressed with gzip etc. are decoded transparently.
        curl_setopt($this->curl, CURLOPT_ENCODING, '');

        $result = curl_exec($this->curl);

        // Only transcode an actual body; curl_exec() returns false on failure.
        if ($transcoding && is_string($result)) {
            // Detect the page encoding.
            $encoding = mb_detect_encoding($result, 'GB2312, GBK, UTF-8');
            if ($encoding != 'UTF-8') {
                // Convert the page to UTF-8, dropping bytes that cannot be converted.
                $result = iconv('gbk', 'utf-8//IGNORE', $result);
            }
        }

        curl_close($this->curl);

        return $result;
    }
}

// $web_url is the address of the target page.
$web_url = "http://www.baidu.com?curl.php";
$webpage = new cg_http($web_url);
$content = $webpage->send_request();

// basename() of a URL can contain characters such as "?" that are not valid
// in file names, so replace everything outside a safe set.
$filename = preg_replace('/[^A-Za-z0-9._-]/', '_', basename($web_url));

// Store the downloaded page inside the "aa" folder, creating it if needed.
$save_dir = "D://aa";
if (!is_dir($save_dir)) {
    mkdir($save_dir, 0777, true);
}
if ($content !== false) {
    file_put_contents($save_dir . "/" . $filename, $content);
}
如果是多个url地址也可以批量操作
// Put the URLs to process into $arr_url. The entry below is only an example
// and is used when the array has not been defined earlier.
if (!isset($arr_url)) {
    $arr_url = array(
        "http://www.baidu.com?curl.php",
    );
}

foreach ($arr_url as $key => $urls) {
    $webpage = new cg_http($urls);
    $content = $webpage->send_request();

    // A failed download returns false; skip it instead of writing an empty file.
    if ($content === false) {
        continue;
    }

    // Each page is saved under its array key as the file name.
    $filename = "{$key}";

    // The separator is required so the file lands inside D://aa
    // rather than next to it as "D://aa0", "D://aa1", ...
    file_put_contents("D://aa/" . $filename, $content);
}
读取D://aa写入的文件
// Walk every file saved in D://aa and run a regular expression over its content.
$dir = "D://aa";
$handler = opendir($dir);

if ($handler !== false) {
    // Strict comparison: readdir() returns false at the end of the directory,
    // and a loose "!= false" would also stop on an entry named "0".
    while (($file = readdir($handler)) !== false) {
        if ($file == ".." || $file == ".") {
            continue;
        }

        // The separator is required; "$dir.$file" would point at "D://aa<name>".
        $content = file_get_contents($dir . "/" . $file);
        if ($content === false) {
            continue;
        }

        // Strip newlines, tabs and carriage returns from the page source.
        $content = str_replace(array("\n", "\t", "\r"), '', $content);

        // "PCRE" is a placeholder: replace it with the regular expression you
        // want to match (including delimiters). Matches are stored in $matches.
        preg_match_all("PCRE", $content, $matches);
    }

    closedir($handler);
}
相关文章推荐
- java-用HttpURLConnection发送Http请求.
- Android Native 绘图方法
- VBScript 剪贴板抓取URL并在浏览器中打开
- C#中struct和class的区别详解
- VBS ArrayList Class vbs中的数组类
- 用vbs 实现从剪贴板中抓取一个 URL 然后在浏览器中打开该 Web 站点
- 大家看了就明白了css样式中类class与标识id选择符的区别小结
- asp获取URL参数的几种方法分析总结[原创]_应用技巧_脚本之家
- C#实现在网页中根据url截图并输出到网页的方法
- php下目前为目最全的CURL中文说明
- zend framework框架中url大小写问题解决方法
- PHP curl_setopt()函数实例代码与参数分析
- php采用curl访问域名返回405 method not allowed提示的解决方法
- php中file_get_content 和curl以及fopen 效率分析
- php中curl和file_get_content的区别
- php实现curl模拟ftp上传的方法
- PHP简单开启curl的方法(测试可行)[原创]_php技巧_脚本之家
- url decode problem 解决方法
- php url地址栏传中文乱码解决方法集合
- 深入了解PHP类Class的概念