网站信息采集
2015-12-25 10:19
621 查看
<?php
/**
 * Crawler written by MT.
 *
 * Entry point: declares the response encoding, then walks the "html"
 * directory with movePath().
 */
// Serve the response as UTF-8 HTML.
header('Content-type: text/html; charset=utf-8');
// Not referenced anywhere else in the visible code.
$num = 0;
// Directory to crawl.
$dir = 'html';
movePath($dir);
/**
 * Recursively walk a directory and pass every numerically named HTML page
 * to fillRead().
 *
 * An entry is treated as a page when its name contains ".html" (not at
 * position 0) and the part before the first ".html" is numeric, e.g.
 * "123.html". Entries containing ".html" with a non-numeric prefix are
 * ignored. Every other entry is tried as a sub-directory; the is_dir()
 * guard at the top of the call makes that a no-op for ordinary files.
 *
 * @param string $dir Directory to scan, without a trailing slash.
 * @return void
 */
function movePath($dir)
{
    if (!is_dir($dir)) {
        return;
    }

    $dh = opendir($dir);
    if ($dh === false) {
        return;
    }

    // Strict comparison is required: readdir() returns the entry name, and a
    // name such as "0" is falsy, so "!= false" would end the scan early.
    while (($file = readdir($dh)) !== false) {
        if ($file === "." || $file === "..") {
            continue;
        }

        $path = $dir . "/" . $file;
        $pos = strpos($file, ".html");

        // A match at position 0 (name starts with ".html") is deliberately
        // treated like "no match", as the original truthiness test did.
        if ($pos !== false && $pos > 0) {
            // is_file() keeps a directory that merely looks like a page
            // (e.g. "12.html/") from being handed to the parser.
            if (is_numeric(substr($file, 0, $pos)) && is_file($path)) {
                fillRead($path);
            }
        } else {
            movePath($path);
        }
    }

    closedir($dh);
}
/*
* $conn = mysql_connect("localhost","root","");
* if (!$conn){
* die('Could not connect: ' . mysql_error());
* }
*/
/**
 * Extract article fields from one saved HTML page and queue them for import.
 *
 * Keys written when the corresponding markup is found:
 *   - "title"          first <div class="title"> body
 *   - "countent"       first <div class="arcContent"> body (the misspelt key
 *                      is kept because the import loop reads it)
 *   - "category"       text of the 2nd link in the "box2 ... f_l" block
 *   - "parentcategory" text of the 3rd link in that block
 *   - "public_time"    strtotime() of the 1st space-separated info token
 *   - "laiyuan"        2nd info token (presumably the article source)
 *   - "author"         3rd info token
 * Keys whose markup is missing are simply absent. The record, even when
 * empty, is appended to $_SERVER["xixi"], which this script uses as its
 * global queue.
 *
 * @param string $file Path of the HTML file to parse.
 * @return void Terminates the script with "Unable to open file!" when the
 *              file cannot be read.
 */
function fillRead($file)
{
    $array = array();

    // file_get_contents() replaces fopen() + fread($fh, filesize($file)):
    // fread() rejects a length of 0, so an empty file raised a warning
    // (ValueError on PHP 8) instead of yielding an empty string.
    $return = file_get_contents($file);
    if ($return === false) {
        die("Unable to open file!");
    }

    // Only the first occurrence of each pattern is used, so preg_match()
    // is enough; the patterns themselves are unchanged.
    if (preg_match("/<div class=\"title\">(.*?)<\/div>/ism", $return, $titleMatch)) {
        $array["title"] = $titleMatch[1];
    }

    if (preg_match("/<div class=\"arcContent\">(.*?)<\/div>/ism", $return, $bodyMatch)) {
        $array["countent"] = $bodyMatch[1];
    }

    $category = "/class=\"box2.*?labox.*?mr_5.*?f_l\">.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?>/ism";
    if (preg_match($category, $return, $crumbs)) {
        // Group 1 (the first link) is captured but not used.
        $array["category"] = $crumbs[2];
        $array["parentcategory"] = $crumbs[3];
    }

    if (preg_match("/class=\"info\">.*?<small>(.*?)<\/div>/ism", $return, $info)) {
        // The info line is split on single spaces; the token index decides
        // which field the value after "</small>" belongs to.
        foreach (explode(" ", $info[1]) as $key => $val) {
            $text = str_replace("\t", "", $val);
            $text = str_replace("\r\n", "", $text);

            // A separate match variable keeps the outer result intact.
            if (!preg_match("/(<\/small>)+(.*)/i", $text, $field)) {
                continue;
            }

            switch ($key) {
                case 0:
                    $array["public_time"] = strtotime($field[2]);
                    break;
                case 1:
                    $array["laiyuan"] = $field[2];
                    break;
                case 2:
                    $array["author"] = $field[2];
                    break;
            }
        }
    }

    $_SERVER["xixi"][] = $array;
}
// Import step: insert every record queued in $_SERVER["xixi"] by fillRead()
// into cx_article, resolving category names against cx_category.
//
// NOTE(review): the mysql_* extension was removed in PHP 7. It is kept here
// so the script still runs wherever the original did; moving to mysqli/PDO
// prepared statements is the long-term fix.
$conn = @mysql_connect("localhost", "root", "");
if (!$conn) {
    die('Could not connect: ' . mysql_error());
}
if (!mysql_select_db("cmysw", $conn)) {
    die('Could not select database: ' . mysql_error($conn));
}
mysql_query("set names utf8", $conn);

$cid = null;
$pid = null;

// Nothing is queued when the crawl found no pages.
$rows = isset($_SERVER["xixi"]) ? $_SERVER["xixi"] : array();

foreach ($rows as $row) {
    // fillRead() omits keys whose markup was not found; fill in defaults so
    // the lookups below never touch an undefined index.
    $row = $row + array(
        "title" => "",
        "countent" => "",
        "category" => "",
        "parentcategory" => "",
        "author" => "",
        "laiyuan" => "",
        "public_time" => "",
    );

    // Resolve both names per row. Starting from null each time keeps a row
    // without a match from inheriting the previous row's ids.
    $ids = array("category" => null, "parentcategory" => null);
    foreach (array("category", "parentcategory") as $field) {
        // An empty name would become LIKE '%%' and match every category.
        if ($row[$field] === "") {
            continue;
        }
        // Escape for the SQL string first, then neutralise LIKE wildcards.
        $like = addcslashes(mysql_real_escape_string($row[$field], $conn), "%_");
        $result = mysql_query("select * from cx_category where name like '%" . $like . "%'", $conn);
        if ($result === false) {
            continue;
        }
        // When several categories match, the last one wins (as before).
        while ($match = mysql_fetch_array($result)) {
            $ids[$field] = $match["id"];
        }
        mysql_free_result($result);
    }
    // $cid is resolved but not used by the insert below, which stores the
    // parent-category id in the cid column exactly as the original did.
    $cid = $ids["category"];
    $pid = $ids["parentcategory"];

    if (empty($pid) || $row["title"] === "") {
        continue;
    }

    // Summary: tag-free body with whitespace removed, first 50 characters.
    // NOTE(review): the original pattern read "\ \;| " where "&nbsp;" and an
    // ideographic space (U+3000) are expected; it looks like the entity was
    // HTML-decoded when the snippet was published. Restored here - confirm.
    $summary = preg_replace("/(\s|&nbsp;|\xe3\x80\x80|\xc2\xa0)/", "", strip_tags($row["countent"]));
    $summary = mb_substr($summary, 0, 50, 'utf-8');

    // Every value comes from scraped pages, so escape all of them before
    // they are placed into the statement.
    $values = array(
        "cid" => $pid,
        "title" => $row["title"],
        "author" => $row["author"],
        "editor" => $row["laiyuan"],
        "summary" => $summary,
        "content" => $row["countent"],
        "time" => $row["public_time"],
    );
    foreach ($values as $name => $value) {
        $values[$name] = mysql_real_escape_string((string) $value, $conn);
    }

    $cond = mysql_query("insert into cx_article(cid,title,title_color,author,editor,summary,content,publish_date,create_date,keywords,article_view,is_check,is_pass,create_user)
values('{$values["cid"]}','{$values["title"]}','','{$values["author"]}','{$values["editor"]}','{$values["summary"]}','{$values["content"]}','{$values["time"]}','{$values["time"]}','','','1','1','1')", $conn);
    if ($cond === false) {
        // Keep importing the remaining rows, but leave a trace of the failure.
        error_log('cx_article insert failed: ' . mysql_error($conn));
    }
}
?>
/**
 * Crawler written by MT.
 *
 * Entry point: declares the response encoding, then walks the "html"
 * directory with movePath().
 */
// Serve the response as UTF-8 HTML.
header('Content-type: text/html; charset=utf-8');
// Not referenced anywhere else in the visible code.
$num = 0;
// Directory to crawl.
$dir = 'html';
movePath($dir);
/**
 * Recursively walk a directory and pass every numerically named HTML page
 * to fillRead().
 *
 * An entry is treated as a page when its name contains ".html" (not at
 * position 0) and the part before the first ".html" is numeric, e.g.
 * "123.html". Entries containing ".html" with a non-numeric prefix are
 * ignored. Every other entry is tried as a sub-directory; the is_dir()
 * guard at the top of the call makes that a no-op for ordinary files.
 *
 * @param string $dir Directory to scan, without a trailing slash.
 * @return void
 */
function movePath($dir)
{
    if (!is_dir($dir)) {
        return;
    }

    $dh = opendir($dir);
    if ($dh === false) {
        return;
    }

    // Strict comparison is required: readdir() returns the entry name, and a
    // name such as "0" is falsy, so "!= false" would end the scan early.
    while (($file = readdir($dh)) !== false) {
        if ($file === "." || $file === "..") {
            continue;
        }

        $path = $dir . "/" . $file;
        $pos = strpos($file, ".html");

        // A match at position 0 (name starts with ".html") is deliberately
        // treated like "no match", as the original truthiness test did.
        if ($pos !== false && $pos > 0) {
            // is_file() keeps a directory that merely looks like a page
            // (e.g. "12.html/") from being handed to the parser.
            if (is_numeric(substr($file, 0, $pos)) && is_file($path)) {
                fillRead($path);
            }
        } else {
            movePath($path);
        }
    }

    closedir($dh);
}
/*
* $conn = mysql_connect("localhost","root","");
* if (!$conn){
* die('Could not connect: ' . mysql_error());
* }
*/
/**
 * Extract article fields from one saved HTML page and queue them for import.
 *
 * Keys written when the corresponding markup is found:
 *   - "title"          first <div class="title"> body
 *   - "countent"       first <div class="arcContent"> body (the misspelt key
 *                      is kept because the import loop reads it)
 *   - "category"       text of the 2nd link in the "box2 ... f_l" block
 *   - "parentcategory" text of the 3rd link in that block
 *   - "public_time"    strtotime() of the 1st space-separated info token
 *   - "laiyuan"        2nd info token (presumably the article source)
 *   - "author"         3rd info token
 * Keys whose markup is missing are simply absent. The record, even when
 * empty, is appended to $_SERVER["xixi"], which this script uses as its
 * global queue.
 *
 * @param string $file Path of the HTML file to parse.
 * @return void Terminates the script with "Unable to open file!" when the
 *              file cannot be read.
 */
function fillRead($file)
{
    $array = array();

    // file_get_contents() replaces fopen() + fread($fh, filesize($file)):
    // fread() rejects a length of 0, so an empty file raised a warning
    // (ValueError on PHP 8) instead of yielding an empty string.
    $return = file_get_contents($file);
    if ($return === false) {
        die("Unable to open file!");
    }

    // Only the first occurrence of each pattern is used, so preg_match()
    // is enough; the patterns themselves are unchanged.
    if (preg_match("/<div class=\"title\">(.*?)<\/div>/ism", $return, $titleMatch)) {
        $array["title"] = $titleMatch[1];
    }

    if (preg_match("/<div class=\"arcContent\">(.*?)<\/div>/ism", $return, $bodyMatch)) {
        $array["countent"] = $bodyMatch[1];
    }

    $category = "/class=\"box2.*?labox.*?mr_5.*?f_l\">.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?>/ism";
    if (preg_match($category, $return, $crumbs)) {
        // Group 1 (the first link) is captured but not used.
        $array["category"] = $crumbs[2];
        $array["parentcategory"] = $crumbs[3];
    }

    if (preg_match("/class=\"info\">.*?<small>(.*?)<\/div>/ism", $return, $info)) {
        // The info line is split on single spaces; the token index decides
        // which field the value after "</small>" belongs to.
        foreach (explode(" ", $info[1]) as $key => $val) {
            $text = str_replace("\t", "", $val);
            $text = str_replace("\r\n", "", $text);

            // A separate match variable keeps the outer result intact.
            if (!preg_match("/(<\/small>)+(.*)/i", $text, $field)) {
                continue;
            }

            switch ($key) {
                case 0:
                    $array["public_time"] = strtotime($field[2]);
                    break;
                case 1:
                    $array["laiyuan"] = $field[2];
                    break;
                case 2:
                    $array["author"] = $field[2];
                    break;
            }
        }
    }

    $_SERVER["xixi"][] = $array;
}
// Import step: insert every record queued in $_SERVER["xixi"] by fillRead()
// into cx_article, resolving category names against cx_category.
//
// NOTE(review): the mysql_* extension was removed in PHP 7. It is kept here
// so the script still runs wherever the original did; moving to mysqli/PDO
// prepared statements is the long-term fix.
$conn = @mysql_connect("localhost", "root", "");
if (!$conn) {
    die('Could not connect: ' . mysql_error());
}
if (!mysql_select_db("cmysw", $conn)) {
    die('Could not select database: ' . mysql_error($conn));
}
mysql_query("set names utf8", $conn);

$cid = null;
$pid = null;

// Nothing is queued when the crawl found no pages.
$rows = isset($_SERVER["xixi"]) ? $_SERVER["xixi"] : array();

foreach ($rows as $row) {
    // fillRead() omits keys whose markup was not found; fill in defaults so
    // the lookups below never touch an undefined index.
    $row = $row + array(
        "title" => "",
        "countent" => "",
        "category" => "",
        "parentcategory" => "",
        "author" => "",
        "laiyuan" => "",
        "public_time" => "",
    );

    // Resolve both names per row. Starting from null each time keeps a row
    // without a match from inheriting the previous row's ids.
    $ids = array("category" => null, "parentcategory" => null);
    foreach (array("category", "parentcategory") as $field) {
        // An empty name would become LIKE '%%' and match every category.
        if ($row[$field] === "") {
            continue;
        }
        // Escape for the SQL string first, then neutralise LIKE wildcards.
        $like = addcslashes(mysql_real_escape_string($row[$field], $conn), "%_");
        $result = mysql_query("select * from cx_category where name like '%" . $like . "%'", $conn);
        if ($result === false) {
            continue;
        }
        // When several categories match, the last one wins (as before).
        while ($match = mysql_fetch_array($result)) {
            $ids[$field] = $match["id"];
        }
        mysql_free_result($result);
    }
    // $cid is resolved but not used by the insert below, which stores the
    // parent-category id in the cid column exactly as the original did.
    $cid = $ids["category"];
    $pid = $ids["parentcategory"];

    if (empty($pid) || $row["title"] === "") {
        continue;
    }

    // Summary: tag-free body with whitespace removed, first 50 characters.
    // NOTE(review): the original pattern read "\ \;| " where "&nbsp;" and an
    // ideographic space (U+3000) are expected; it looks like the entity was
    // HTML-decoded when the snippet was published. Restored here - confirm.
    $summary = preg_replace("/(\s|&nbsp;|\xe3\x80\x80|\xc2\xa0)/", "", strip_tags($row["countent"]));
    $summary = mb_substr($summary, 0, 50, 'utf-8');

    // Every value comes from scraped pages, so escape all of them before
    // they are placed into the statement.
    $values = array(
        "cid" => $pid,
        "title" => $row["title"],
        "author" => $row["author"],
        "editor" => $row["laiyuan"],
        "summary" => $summary,
        "content" => $row["countent"],
        "time" => $row["public_time"],
    );
    foreach ($values as $name => $value) {
        $values[$name] = mysql_real_escape_string((string) $value, $conn);
    }

    $cond = mysql_query("insert into cx_article(cid,title,title_color,author,editor,summary,content,publish_date,create_date,keywords,article_view,is_check,is_pass,create_user)
values('{$values["cid"]}','{$values["title"]}','','{$values["author"]}','{$values["editor"]}','{$values["summary"]}','{$values["content"]}','{$values["time"]}','{$values["time"]}','','','1','1','1')", $conn);
    if ($cond === false) {
        // Keep importing the remaining rows, but leave a trace of the failure.
        error_log('cx_article insert failed: ' . mysql_error($conn));
    }
}
?>
相关文章推荐
- 《架构师手记》-151224-为啥要生个“阴阳人”!
- 探索大型网站技术架构(三)
- LVS+Keepalived实现四层负载及高可用
- 架构师于小波:魅族实时消息推送架构
- 写网站的思路启程
- 招聘网站需求分析
- 网站页面引导操作 - intro.js
- 加上快捷键,让你的网站酷起来
- 加上快捷键,让你的网站酷起来
- android View 之Android控件架构
- 域名更换,mycncart及opencart网站搬迁时如何更换config.php
- 腾讯云搜问题解答1——用户为什么在你的网站搜索不到想要的内容
- 20151224今天发现到的两篇关于CSS架构、可复用可维护CSS和CSS学习提升能有改变思想观念意识的文章 分别是CSS架构目标和说说CSS学习中的瓶颈
- 翻译:Linux的电源管理架构
- 微信扫描二维码登录网站技术原理
- Android项目重构之路:架构篇
- Linux ALSA声卡驱动之八:ASoC架构中的Platform
- C# 网站部署IIS常见异常及解决方法
- Linux ALSA声卡驱动之七:ASoC架构中的Codec
- Linux ALSA声卡驱动之六:ASoC架构中的Machine