您的位置:首页 > 运维架构 > 网站架构

网站信息采集

2015-12-25 10:19 621 查看
<?php

/**
 * Crawler written by MT.
 *
 * Entry point of the script: declares the response encoding, picks the
 * directory that holds the saved pages and starts the recursive scan.
 */

// Declare the response as UTF-8 HTML.
header("Content-type: text/html; charset=utf-8");

$dir = "html"; // directory to crawl

// Initialised here; not read or modified anywhere else in the visible code.
$num = 0;

// Walk $dir recursively; movePath() hands matching pages to fillRead().
movePath($dir);

/**
 * Recursively walks a directory tree and passes every numerically named
 * ".html" page to fillRead().
 *
 * An entry counts as a page when the text before its first ".html" is
 * numeric according to is_numeric() (for example "123.html").
 * Sub-directories are descended into regardless of their name. A path that
 * is not a directory, or a directory that cannot be opened, is skipped.
 *
 * @param string $dir Directory to scan, without a trailing slash.
 * @return void
 */
function movePath($dir)
{
    if (!is_dir($dir)) {
        return;
    }

    $dh = opendir($dir);
    if ($dh === false) {
        return;
    }

    // Strict comparison is required: readdir() returns the entry name, and a
    // name such as "0" is falsy, so "!= false" would end the scan early.
    while (($file = readdir($dh)) !== false) {
        if ($file === "." || $file === "..") {
            continue;
        }

        $path = $dir . "/" . $file;

        // Decide recursion by what the entry is, not by what it is called, so
        // a directory with ".html" in its name is still walked and is never
        // handed to fillRead() as if it were a page.
        if (is_dir($path)) {
            movePath($path);
            continue;
        }

        // strpos() may legitimately return 0, hence the explicit false check;
        // an empty prefix is then rejected by is_numeric().
        $pos = strpos($file, ".html");
        if ($pos !== false && is_numeric(substr($file, 0, $pos))) {
            fillRead($path);
        }
    }

    closedir($dh);
}

/*

* $conn = mysql_connect("localhost","root","");

* if (!$conn){

* die('Could not connect: ' . mysql_error());

* }

*/

/**
 * Extracts the article fields from one saved HTML page and appends them, as
 * an associative array, to the $_SERVER["xixi"] list.
 *
 * Keys of the appended array (each stays "" when its pattern does not match):
 *   - "title"          contents of the first <div class="title">
 *   - "countent"       contents of the first <div class="arcContent">, up to
 *                      the first closing </div> (nested divs are cut short);
 *                      the key spelling is kept because the import code in
 *                      this file reads it under that name
 *   - "category"       text of the 2nd link after the element whose class
 *                      attribute matches box2 ... labox ... mr_5 ... f_l
 *   - "parentcategory" text of the 3rd link of that same element
 *   - "public_time"    strtotime() of the 1st field of the "info" block
 *                      (false when the text cannot be parsed)
 *   - "laiyuan"        2nd field of the "info" block
 *   - "author"         3rd field of the "info" block
 *
 * The script is terminated with "Unable to open file!" when the file cannot
 * be read.
 *
 * @param string $file Path of the HTML file to parse.
 * @return void
 */
function fillRead($file)
{
    // Pre-seed every key so consumers never hit an undefined index when a
    // page lacks one of the sections.
    $array = array(
        "title" => "",
        "countent" => "",
        "category" => "",
        "parentcategory" => "",
        "public_time" => "",
        "laiyuan" => "",
        "author" => "",
    );

    // file_get_contents() copes with zero-byte files, where
    // fread($handle, filesize($file)) is called with an invalid length of 0.
    $return = file_get_contents($file);
    if ($return === false) {
        die("Unable to open file!");
    }

    // Only the first occurrence of each section is used, so preg_match()
    // is sufficient.
    if (preg_match("/<div class=\"title\">(.*?)<\/div>/ism", $return, $found)) {
        $array["title"] = $found[1];
    }

    if (preg_match("/<div class=\"arcContent\">(.*?)<\/div>/ism", $return, $found)) {
        $array["countent"] = $found[1];
    }

    $category = "/class=\"box2.*?labox.*?mr_5.*?f_l\">.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?>/ism";
    if (preg_match($category, $return, $found)) {
        // The first link is captured but deliberately not stored.
        $array["category"] = $found[2];
        $array["parentcategory"] = $found[3];
    }

    // Position of a field inside the "info" block => key it is stored under.
    $infoKeys = array(0 => "public_time", 1 => "laiyuan", 2 => "author");

    if (preg_match("/class=\"info\">.*?<small>(.*?)<\/div>/ism", $return, $found)) {
        // NOTE(review): fields are split on two consecutive spaces, exactly
        // as in the original source; confirm the separator used by the pages.
        foreach (explode("  ", $found[1]) as $key => $val) {
            if (!isset($infoKeys[$key])) {
                continue;
            }

            $text = str_replace(array("\t", "\r\n"), "", $val);

            // The value is whatever follows the closing </small> of the label.
            if (!preg_match("/(<\/small>)+(.*)/i", $text, $field)) {
                continue;
            }

            $array[$infoKeys[$key]] = ($key === 0) ? strtotime($field[2]) : $field[2];
        }
    }

    $_SERVER["xixi"][] = $array;
}

/*
 * Import step: writes every article collected in $_SERVER["xixi"] into the
 * cx_article table of the "cmysw" database.
 *
 * For each article the category ids are looked up by name in cx_category;
 * the article is inserted only when the "parentcategory" lookup yields an id
 * and the title is not empty. All values reach the database through prepared
 * statements, so quotes inside scraped titles or bodies can neither break
 * the statement nor inject SQL.
 */

/**
 * Looks up a category id by partial name.
 *
 * @param mysqli $conn Open database connection.
 * @param string $name Text the category name has to contain.
 * @return mixed The id of the last matching cx_category row, or null when
 *               the name is empty, nothing matches or the query fails.
 */
function findCategoryId($conn, $name)
{
    $name = (string) $name;

    // An empty name would turn the pattern into '%%', which matches every
    // category and would file the article under an arbitrary one.
    if ($name === "") {
        return null;
    }

    $stmt = mysqli_prepare($conn, "select id from cx_category where name like ?");
    if (!$stmt) {
        return null;
    }

    $pattern = "%" . $name . "%";
    $id = null;
    $found = null;

    mysqli_stmt_bind_param($stmt, "s", $pattern);
    if (mysqli_stmt_execute($stmt)) {
        mysqli_stmt_bind_result($stmt, $id);
        // When several rows match, the last one fetched wins.
        while (mysqli_stmt_fetch($stmt)) {
            $found = $id;
        }
    }
    mysqli_stmt_close($stmt);

    return $found;
}

// Report failures through return values on every PHP version, so the checks
// below stay meaningful (PHP 8.1+ throws exceptions by default).
mysqli_report(MYSQLI_REPORT_OFF);

// NOTE(review): credentials are hard-coded (root, empty password); move them
// to configuration before running this anywhere but a local sandbox.
$conn = mysqli_connect("localhost", "root", "", "cmysw");

if (!$conn) {
    die('Could not connect: ' . mysqli_connect_error());
}

mysqli_set_charset($conn, "utf8");

$cid = null;
$pid = null;
$title = $author = $editor = $summary = $content = $publishDate = $createDate = "";

$insert = mysqli_prepare(
    $conn,
    "insert into cx_article(cid,title,title_color,author,editor,summary,content,publish_date,create_date,keywords,article_view,is_check,is_pass,create_user)"
    . " values(?,?,'',?,?,?,?,?,?,'','','1','1','1')"
);

if (!$insert) {
    die('Could not prepare insert: ' . mysqli_error($conn));
}

// Parameters are bound by reference once; the loop only reassigns them.
mysqli_stmt_bind_param(
    $insert,
    "ssssssss",
    $pid,
    $title,
    $author,
    $editor,
    $summary,
    $content,
    $publishDate,
    $createDate
);

// Nothing was collected when the crawl found no matching page.
$rows = (isset($_SERVER["xixi"]) && is_array($_SERVER["xixi"])) ? $_SERVER["xixi"] : array();

foreach ($rows as $row) {
    // Fill in any field the parser did not provide.
    $row += array(
        "title" => "",
        "countent" => "",
        "category" => "",
        "parentcategory" => "",
        "public_time" => "",
        "laiyuan" => "",
        "author" => "",
    );

    // Both ids are recomputed for every article, so a failed lookup can no
    // longer inherit the id found for the previous article.
    // $cid is not used by the insert; it is kept as a file-level variable.
    $cid = findCategoryId($conn, $row["category"]);
    $pid = findCategoryId($conn, $row["parentcategory"]);

    $title = (string) $row["title"];

    if (empty($pid) || $title === "") {
        continue;
    }

    $content = (string) $row["countent"];

    // Summary: tag-free text without whitespace, limited to 50 characters.
    // NOTE(review): "&nbsp;" restores what appears to be an HTML entity that
    // was decoded when this script was published; the alternative as found
    // ("\ \;") could never match because \s consumes the space first.
    $summary = preg_replace('/(?:\s|&nbsp;|\xc2\xa0)/', "", strip_tags($content));
    $summary = mb_substr($summary, 0, 50, 'utf-8');

    $author = (string) $row["author"];
    $editor = (string) $row["laiyuan"];
    // A failed strtotime() (false) becomes "", as it did when interpolated.
    $publishDate = (string) $row["public_time"];
    $createDate = $publishDate;

    $cond = mysqli_stmt_execute($insert);

    if (!$cond) {
        // Keep importing the remaining articles, but do not hide the failure.
        trigger_error(
            'Insert failed for "' . $title . '": ' . mysqli_stmt_error($insert),
            E_USER_WARNING
        );
    }
}

mysqli_stmt_close($insert);

?>
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: