PHP 采集常用函数整理
2010-11-02 17:52
567 查看
/*
 * Usage example: fetch a page, isolate its news <div>, then list the links inside it.
 *
 * If the remote page is not UTF-8 (e.g. GBK), convert it before matching:
 *     $body = iconv("GBK", "UTF-8", $body);
 * match_links($news_block) can be used instead of the second preg_match_all().
 */
$url = "http://www.phpchina.com/";
$body = fopen_url($url);
preg_match_all('|<div\s*class="news left"\s*id="news">(.*?)</div>|is', $body, $match);
// Use an empty subject when the news block was not found, so the link pattern still runs cleanly.
$news_block = isset($match[1][0]) ? $match[1][0] : '';
preg_match_all('|<a[^<>]*href="([^"]+)"\s*title="([^"]+)"[^<>]*>([^<>]+)</a>|i', $news_block, $matches);
print_r($matches);

/**
 * Fetch the content of a remote file.
 *
 * file_get_contents() is tried first. When it fails for a URL while
 * allow_url_fopen is disabled, the request is retried through cURL if that
 * extension is loaded. Errors are suppressed on purpose: this is a
 * best-effort fetch and failure is reported through the return value.
 *
 * @param string $url HTTP address of the file.
 * @return string The raw body, or '' when nothing could be fetched.
 */
function fopen_url($url)
{
    // An empty path makes file_get_contents() throw on PHP 8, which "@" cannot silence.
    if (!is_string($url) || $url === '') {
        return '';
    }

    $file_content = @file_get_contents($url);

    if ($file_content === false
        && strpos($url, '://') !== false
        && !filter_var(ini_get('allow_url_fopen'), FILTER_VALIDATE_BOOLEAN)
        && function_exists('curl_init')
    ) {
        $file_content = _fopen_url_curl($url);
    }

    return is_string($file_content) ? $file_content : '';
}

/**
 * Fetch a URL with cURL (private helper of fopen_url()).
 *
 * @param string $url Address to request.
 * @return string|false Response body, or false on failure.
 */
function _fopen_url_curl($url)
{
    $curl_handle = curl_init();
    if ($curl_handle === false) {
        return false;
    }

    curl_setopt($curl_handle, CURLOPT_URL, $url);
    curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 2);
    curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
    // Treat HTTP status codes >= 400 as a failure instead of returning the error page.
    curl_setopt($curl_handle, CURLOPT_FAILONERROR, 1);
    // NOTE(review): this user agent ("trackback spam check") looks inherited from another tool — confirm it is intended.
    curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Trackback Spam Check');
    $response = curl_exec($curl_handle);

    // curl_close() has no effect since PHP 8.0; only call it where it still releases the handle.
    if (PHP_VERSION_ID < 80000) {
        curl_close($curl_handle);
    }

    return $response;
}

/**
 * Extract hyperlinks from an HTML fragment.
 *
 * The result has three lists:
 *   'link'    => every non-empty href value (quoted or unquoted), in document order
 *   'content' => every non-empty anchor body (the markup between <a ...> and </a>)
 *   'all'     => every complete <a ...>...</a> element that matched
 *
 * Empty values are skipped, so 'link' and 'content' can have different lengths
 * when an anchor has an empty href or an empty body.
 *
 * @param string $document HTML to scan.
 * @return array Array with the keys 'link', 'content' and 'all'; each list is empty when nothing matched.
 */
function match_links($document)
{
    $match = array('link' => array(), 'content' => array(), 'all' => array());

    // Group 1: optional opening quote; group 2: quoted href; group 3: unquoted href; group 4: anchor body.
    $pattern = '~<\s*a\s.*?href\s*=\s*(["\'])?(?(1)(.*?)\1|([^\s>]+))[^>]*>?(.*?)</a>~isx';
    if (!preg_match_all($pattern, (string) $document, $links)) {
        return $match;
    }

    foreach ($links[0] as $index => $anchor) {
        // Only one of the two href groups is filled for any given anchor.
        $href = $links[2][$index] !== '' ? $links[2][$index] : $links[3][$index];
        if ($href !== '') {
            $match['link'][] = $href;
        }
        // Compare against '' rather than using empty(), so a body of "0" is kept.
        if ($links[4][$index] !== '') {
            $match['content'][] = $links[4][$index];
        }
        $match['all'][] = $anchor;
    }

    return $match;
}

/**
 * Read a file (or URL) and return its content as one string.
 *
 * Terminates the script when the file cannot be read.
 *
 * @param string $url Path or URL to read.
 * @return string The complete content.
 */
function openfile($url)
{
    // Read once; calling file() twice would fetch a remote resource twice.
    $lines = file($url);
    if ($lines === false) {
        die("文件打开失败!");
    }

    return implode('', $lines);
}

/**
 * Return the text between the first $start marker and the next $end marker.
 *
 * The search region ends at the second occurrence of $start, so a repeated
 * start marker before $end truncates the result.
 *
 * @param string $start Opening marker.
 * @param string $end   Closing marker.
 * @param string $file  Text to search.
 * @return string The enclosed text, or '' when $start is absent or a marker is empty.
 */
function cut($start, $end, $file)
{
    // explode() rejects an empty delimiter.
    if ((string) $start === '' || (string) $end === '') {
        return '';
    }

    // Limit 3 is enough: only the piece between the first two start markers is used.
    $pieces = explode($start, $file, 3);
    if (!isset($pieces[1])) {
        return '';
    }

    $inner = explode($end, $pieces[1], 2);

    return $inner[0];
}

/**
 * Remove the section delimited by $start and $end, markers included.
 *
 * The enclosed text is located with cut() and then removed wherever it occurs
 * in $content; after that every adjacent "$start$end" pair is removed.
 *
 * @param string $start   Opening marker.
 * @param string $end     Closing marker.
 * @param string $content Text to clean.
 * @return string The cleaned text.
 */
function del($start, $end, $content)
{
    $unwanted = cut($start, $end, $content);
    $content = str_replace($unwanted, "", $content);

    return str_replace($start . $end, "", $content);
}

/**
 * Reduce a URL to its host name without "www.".
 *
 * Everything from "http://" or "https://" onwards is replaced by the host
 * part; input without such a prefix is left as it is. Every "www." substring
 * of the result is then removed.
 *
 * @param string $url URL to analyse.
 * @return string Host name (including a port, if present).
 */
function getname($url)
{
    $referer = preg_replace('~https?://([^/]+).*~i', '$1', $url);

    return str_replace("www.", "", $referer);
}

/**
 * Remove every <tag ...>...</tag> element, content included (private helper).
 *
 * The word boundary after the tag name keeps longer names from matching,
 * e.g. stripping "a" must not touch <abbr>.
 *
 * @param string $tag     Element name.
 * @param string $content HTML to clean.
 * @return string|null Cleaned HTML, or null when the regex engine reports an error.
 */
function _cls_strip_element($tag, $content)
{
    $name = preg_quote($tag, '~');

    return preg_replace('~<' . $name . '\b[^>]*>.*?</' . $name . '\s*>~si', '', $content);
}

/**
 * Remove all <table> elements together with their content.
 *
 * @param string $content HTML to clean.
 * @return string Cleaned HTML.
 */
function clstable($content)
{
    return _cls_strip_element('table', $content);
}

/**
 * Remove all <script> elements together with their content.
 *
 * @param string $content HTML to clean.
 * @return string Cleaned HTML.
 */
function clsscript($content)
{
    return _cls_strip_element('script', $content);
}

/**
 * Remove all <div> elements together with their content.
 *
 * Matching is non-greedy, so for nested divs the match stops at the first
 * closing tag.
 *
 * @param string $content HTML to clean.
 * @return string Cleaned HTML.
 */
function clsdiv($content)
{
    return _cls_strip_element('div', $content);
}

/**
 * Remove all <iframe> elements together with their content.
 *
 * @param string $content HTML to clean.
 * @return string Cleaned HTML.
 */
function clsifr($content)
{
    return _cls_strip_element('iframe', $content);
}

/**
 * Remove table rows and cells.
 *
 * Complete <td> elements are removed first, then complete <tr> elements,
 * and finally any unpaired opening or closing tr/td tags that are left.
 *
 * @param string $content HTML to clean.
 * @return string Cleaned HTML.
 */
function clstrtd($content)
{
    $clscontent = _cls_strip_element('td', $content);
    $clscontent = _cls_strip_element('tr', $clscontent);

    $stray_tags = array(
        '~<tr\b[^>]*>~i',
        '~<td\b[^>]*>~i',
        '~</tr\s*>~i',
        '~</td\s*>~i',
    );

    return preg_replace($stray_tags, '', $clscontent);
}

/**
 * Remove all hyperlinks (<a> elements) together with their link text.
 *
 * @param string $content HTML to clean.
 * @return string Cleaned HTML.
 */
function clsa($content)
{
    return _cls_strip_element('a', $content);
}

/**
 * Strip all HTML from a string and decode a small set of entities.
 *
 * Steps, applied in this order: remove <script> elements, remove every
 * remaining tag, collapse whitespace that follows a line break, decode the
 * listed named entities, then decode decimal numeric entities.
 *
 * Entities above 127 are decoded to single bytes via chr(), i.e. the result
 * is not valid UTF-8 for those characters.
 *
 * @param string $content HTML to convert.
 * @return string|null Plain text, or null when the regex engine reports an error.
 */
function clearhtml($content)
{
    $search = array(
        '~<script[^>]*?>.*?</script>~si', // strip javascript
        '~<[/!]*?[^<>]*?>~si',            // strip HTML tags
        '~([\r\n])[\s]+~',                // strip whitespace after a line break
        '~&(quot|#34);~i',                // replace HTML entities
        '~&(amp|#38);~i',
        '~&(lt|#60);~i',
        '~&(gt|#62);~i',
        '~&(nbsp|#160);~i',
        '~&(iexcl|#161);~i',
        '~&(cent|#162);~i',
        '~&(pound|#163);~i',
        '~&(copy|#169);~i',
    );
    $replace = array(
        "",
        "",
        '$1',
        "\"",
        "&",
        "<",
        ">",
        " ",
        chr(161),
        chr(162),
        chr(163),
        chr(169),
    );

    $text = preg_replace($search, $replace, $content);
    if ($text === null) {
        return null;
    }

    // Decimal numeric entities. A callback replaces the "e" (eval) modifier, which PHP 7 removed.
    return preg_replace_callback(
        '~&#(\d+);~',
        function ($entity) {
            return chr((int) $entity[1]);
        },
        $text
    );
}

/**
 * Write data to the cache file ./<cachedir>/<cachename>.php.
 *
 * The directory is created when missing. When the file cannot be opened for
 * writing, a message is printed and the script terminates.
 *
 * @param string $cachedir  Cache directory, relative to the current working directory.
 * @param string $cachename Cache file name without the ".php" extension.
 * @param string $cachedata Content to write.
 * @return void
 */
function writetocache($cachedir, $cachename, $cachedata = '')
{
    $cachedir = './' . $cachedir . '/';
    $cachefile = $cachedir . $cachename . '.php';

    if (!is_dir($cachedir)) {
        // Recursive, so a nested cache directory can be created in one call.
        @mkdir($cachedir, 0777, true);
    }

    $fp = @fopen($cachefile, 'wb');
    if ($fp === false) {
        echo 'Can not write to cache files, please check directory ' . $cachedir . ' .';
        exit;
    }

    @fwrite($fp, $cachedata);
    @fclose($fp);
    // NOTE(review): 0777 leaves an executable .php cache file world-writable — confirm this is required.
    @chmod($cachefile, 0777);
}

/**
 * Collect the values of the first capture group of $re found in $ufile.
 *
 * $rep1 is replaced by $rep2 in every captured value, then duplicates are
 * removed and the list is re-indexed.
 *
 * @param string       $re    Regular expression with at least one capture group.
 * @param string       $ufile Text to search.
 * @param string|array $rep1  Search value(s) passed to str_replace().
 * @param string|array $rep2  Replacement value(s) passed to str_replace().
 * @return array List of unique results; empty when nothing matched.
 */
function geturl($re, $ufile, $rep1, $rep2)
{
    $outs = array();

    if (preg_match_all($re, $ufile, $out, PREG_PATTERN_ORDER) && isset($out[1])) {
        $outs = str_replace($rep1, $rep2, $out[1]);
    }

    // Merge identical links and re-index.
    return resetar($outs);
}

/**
 * Remove duplicate values from an array and re-index it from 0.
 *
 * @param array $outs Input values.
 * @return array Unique values in their original order.
 */
function resetar($outs)
{
    return array_values(array_unique((array) $outs));
}
相关文章推荐