您的位置:首页 > 编程语言 > PHP开发

PHP 采集常用函数整理

2010-11-02 17:52 567 查看
/*
$content = file_get_contents($url);
$content = iconv("GBK","UTF-8",$content);
print_r($match);
*/
// Demo: fetch the phpchina.com home page and list the links in its news box.
$url = "http://www.phpchina.com/";
$body = fopen_url($url);

// Isolate the inner HTML of <div class="news left" id="news">...</div>.
// (\s was mangled to /s in the published listing, so the pattern never matched.)
preg_match_all('|<div\s*class="news left"\s*id="news">(.*?)</div>|is', (string) $body, $match);

// Use an empty fragment when the news box is missing, so the second match
// never reads an undefined offset.
$newsHtml = isset($match[1][0]) ? $match[1][0] : '';

// Capture href (group 1), title (group 2) and anchor text (group 3) of every link.
preg_match_all('|<a[^<>]*href="([^"]+)"\s*title="([^"]+)"[^<>]*>([^<>]+)</a>|i', $newsHtml, $matches);
print_r($matches); // alternatively: match_links($newsHtml)

/**
 * Fetch the contents of a remote file.
 *
 * Transports are tried in this order:
 *   1. file_get_contents()
 *   2. fopen()/fread(), only when file_get_contents() does not exist
 *   3. cURL, whenever the previous step produced no result
 *
 * The body is returned unmodified (the fread path used to lowercase it, which
 * made the result depend on the transport that happened to be used).
 *
 * @param string $url HTTP address of the file
 * @return string The fetched body, or '' when every transport failed
 */
function fopen_url($url)
{
    $file_content = false;

    if (function_exists('file_get_contents')) {
        $file_content = @file_get_contents($url);
    } elseif (ini_get('allow_url_fopen') && ($file = @fopen($url, 'rb'))) {
        $file_content = '';
        $i = 0;
        // Read at most 1000 chunks of up to 4096 bytes each.
        while (!feof($file) && $i++ < 1000) {
            $chunk = fread($file, 4096);
            if ($chunk === false) {
                break;
            }
            $file_content .= $chunk;
        }
        fclose($file);
    }

    // Fall back to cURL when the stream-based fetch failed (for example
    // because allow_url_fopen is disabled) instead of giving up.
    if ($file_content === false && function_exists('curl_init')) {
        $curl_handle = curl_init();
        curl_setopt($curl_handle, CURLOPT_URL, $url);
        curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 2);
        curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl_handle, CURLOPT_FAILONERROR, 1);
        curl_setopt($curl_handle, CURLOPT_USERAGENT, 'Trackback Spam Check'); // user agent sent with the request
        $file_content = curl_exec($curl_handle);
        curl_close($curl_handle);
    }

    // Normalise every failure to the empty string.
    return $file_content === false ? '' : $file_content;
}
/**
 * Extract hyperlinks from an HTML fragment.
 *
 * The result holds up to three lists; a key is only present when at least one
 * non-empty value was captured for it:
 *   'link'    => href values (all quoted hrefs first, then all unquoted ones)
 *   'content' => inner HTML of each anchor
 *   'all'     => the complete matched <a ...>...</a> markup
 *
 * Example shape:
 *   array(
 *       'link'    => array('http://www.phpchina.com/?action-viewnews-itemid-37454', ...),
 *       'content' => array('anchor text', ...),
 *       'all'     => array('<a href="...">anchor text</a>', ...),
 *   )
 *
 * Values that are empty() are skipped, so the three lists are not guaranteed
 * to stay index-aligned.
 *
 * @param string $document HTML to scan
 * @return array See above; an empty array when no link is found
 */
function match_links($document)
{
    // Group 1: optional opening quote, group 2: quoted href,
    // group 3: unquoted href, group 4: anchor content.
    $pattern = '~<\s*a\s.*?href\s*=\s*(["\'])?(?(1)(.*?)\1|([^\s>]+))[^>]*>?(.*?)</a>~isx';

    $match = array();
    if (!preg_match_all($pattern, (string) $document, $links)) {
        return $match;
    }

    // Capture group index => result key, in the order the lists are filled.
    $groups = array(2 => 'link', 3 => 'link', 4 => 'content', 0 => 'all');
    foreach ($groups as $index => $key) {
        foreach ($links[$index] as $val) {
            if (!empty($val)) {
                $match[$key][] = $val;
            }
        }
    }

    return $match;
}
// ####################### Read a file or URL into one string #######################
/**
 * Read $url with file() and return its lines joined into a single string.
 *
 * The source is read only once, and an empty file yields '' instead of being
 * reported as an open failure.
 *
 * @param string $url Path or URL to read
 * @return string Full contents
 */
function openfile($url)
{
    $lines = file($url);
    if ($lines === false) {
        die("文件打开失败!"); // message means "Failed to open file!"
    }

    return implode('', $lines);
}
// ####################### Slice a string between two markers #######################
/**
 * Return the text that follows the first occurrence of $start, ending at the
 * next occurrence of $end or of $start (whichever comes first), or at the end
 * of the string when neither follows.
 *
 * @param string $start Opening marker
 * @param string $end   Closing marker
 * @param string $file  Text to search
 * @return string The extracted text; '' when $start is absent or a marker is empty
 */
function cut($start, $end, $file)
{
    // explode() rejects empty delimiters, so there is nothing to cut.
    if ((string) $start === '' || (string) $end === '') {
        return '';
    }

    $content = explode($start, (string) $file);
    if (!isset($content[1])) {
        // $start does not occur in $file.
        return '';
    }

    $content = explode($end, $content[1]);
    return $content[0];
}
// ####################### Remove a marked segment #######################
/**
 * Remove the segment "$start ... $end" (markers included) from $content.
 *
 * The inner text is located with cut() and removed together with its markers
 * as one unit, so identical text outside the markers is left untouched.
 * Empty "$start$end" pairs are removed as well. When $end does not follow
 * $start, no complete segment exists and $content is returned unchanged.
 *
 * @param string $start   Opening marker
 * @param string $end     Closing marker
 * @param string $content Text to clean
 * @return string Cleaned text
 */
function del($start, $end, $content)
{
    $del = cut($start, $end, $content);
    $content = str_replace($start . $del . $end, "", $content);
    $content = str_replace($start . $end, "", $content);
    return $content;
}
// ####################### Extract the domain name #######################
/**
 * Reduce a URL to its host name, without a leading "www.".
 *
 * Text that contains no http:// or https:// URL is returned as is (apart from
 * the "www." stripping).
 *
 * @param string $url URL to analyse
 * @return string Host name
 */
function getname($url)
{
    // Keep only the host part: everything between "://" and the next "/".
    $referer = preg_replace('~https?://([^/]+).*~i', '$1', $url);
    // Strip "www." only at the very start; it may legitimately occur elsewhere.
    $referer = preg_replace('~^www\.~i', '', (string) $referer);
    return $referer;
}
// ####################### Strip <table> blocks #######################
/**
 * Remove every <table ...>...</table> block, case-insensitively and across
 * line breaks. The match is lazy and ends at the first closing tag, so nested
 * tables are only partially removed.
 *
 * @param string $content HTML to clean
 * @return string HTML without the matched blocks
 */
function clstable($content)
{
    return preg_replace('/<table\b[^>]*?>.*?<\/table>/si', "", $content);
}
// ####################### Strip <script> blocks #######################
/**
 * Remove every <script ...>...</script> block, case-insensitively and across
 * line breaks.
 *
 * @param string $content HTML to clean
 * @return string HTML without script blocks
 */
function clsscript($content)
{
    // The published pattern was corrupted by editor residue ("mce:script",
    // "<!--"); this is the plain script-tag pattern.
    return preg_replace('/<script\b[^>]*?>.*?<\/script>/si', "", $content);
}
// ####################### Strip <div> blocks #######################
/**
 * Remove every <div ...>...</div> block, case-insensitively and across line
 * breaks. The match is lazy and ends at the first closing tag, so nested divs
 * are only partially removed.
 *
 * @param string $content HTML to clean
 * @return string HTML without the matched blocks
 */
function clsdiv($content)
{
    return preg_replace('/<div\b[^>]*?>.*?<\/div>/si', "", $content);
}
// ####################### Strip <iframe> blocks #######################
/**
 * Remove every <iframe ...>...</iframe> block, case-insensitively and across
 * line breaks.
 *
 * @param string $content HTML to clean
 * @return string HTML without iframe blocks
 */
function clsifr($content)
{
    return preg_replace('/<IFRAME\b[^>]*?>.*?<\/IFRAME>/si', "", $content);
}
// ####################### Strip <tr> / <td> markup #######################
/**
 * Remove table cells and rows.
 *
 * Complete <td>...</td> and <tr>...</tr> blocks are removed together with
 * their content; any opening or closing tr/td tags left over afterwards are
 * removed on their own. The \b after the tag name keeps other tags that merely
 * start with the same letters (such as <track>) intact.
 *
 * @param string $content HTML to clean
 * @return string HTML without tr/td markup
 */
function clstrtd($content)
{
    $patterns = array(
        '/<td\b[^>]*?>.*?<\/td>/si', // complete cells
        '/<tr\b[^>]*?>.*?<\/tr>/si', // complete rows
        '/<tr\b[^>]*?>/si',          // stray opening tags
        '/<td\b[^>]*?>/si',
        '/<\/tr>/si',                // stray closing tags
        '/<\/td>/si',
    );

    // preg_replace() applies the patterns one after another, in array order.
    return preg_replace($patterns, "", $content);
}
// ####################### Strip hyperlinks #######################
/**
 * Remove every <a ...>...</a> element together with its content,
 * case-insensitively and across line breaks.
 *
 * The \b after the tag name restricts the match to real anchor tags; without
 * it, tags such as <abbr> or <article> would start a match as well.
 *
 * @param string $content HTML to clean
 * @return string HTML without anchors
 */
function clsa($content)
{
    return preg_replace('/<a\b[^>]*?>.*?<\/a>/si', "", $content);
}
// ####################### Strip all HTML markup #######################
/**
 * Convert an HTML fragment to plain text.
 *
 * Steps, in order: drop <script> blocks, drop every remaining tag, collapse
 * the whitespace that follows a line break, decode a fixed set of named
 * entities, and finally decode decimal numeric entities (&#NNN;).
 *
 * Decoded characters are emitted as single bytes via chr(); code points above
 * 255 are reduced to their low byte.
 *
 * NOTE(review): "&amp;" is decoded before the other entities, so input such as
 * "&amp;lt;" ends up as "<". This ordering is kept unchanged from the original.
 *
 * @param string $content HTML to clean
 * @return string|null Plain text; null when preg_replace() reports an error
 */
function clearhtml($content)
{
    $search = array(
        '~<script[^>]*?>.*?</script>~si', // strip javascript
        '~<[/!]*?[^<>]*?>~si',            // strip HTML tags
        '~([\r\n])[\s]+~',                // collapse whitespace after a line break
        '~&(quot|#34);~i',                // decode HTML entities
        '~&(amp|#38);~i',
        '~&(lt|#60);~i',
        '~&(gt|#62);~i',
        '~&(nbsp|#160);~i',
        '~&(iexcl|#161);~i',
        '~&(cent|#162);~i',
        '~&(pound|#163);~i',
        '~&(copy|#169);~i',
    );
    $replace = array(
        "",
        "",
        '$1',
        "\"",
        "&",
        "<",
        ">",
        " ",
        chr(161),
        chr(162),
        chr(163),
        chr(169),
    );

    $text = preg_replace($search, $replace, $content);
    if ($text === null) {
        return $text;
    }

    // Decimal numeric entities. The /e modifier used for this was removed in
    // PHP 7, so a callback performs the conversion instead.
    return preg_replace_callback(
        '~&#(\d+);~',
        function ($m) {
            return chr(((int) $m[1]) & 0xFF);
        },
        $text
    );
}
// ####################### Write a cache file #######################
/**
 * Write $cachedata to ./<cachedir>/<cachename>.php, creating the directory
 * (including missing parent directories) when needed.
 *
 * On failure to open or write the file, an error message naming the affected
 * directory is printed and the script terminates.
 *
 * NOTE(review): directory and file are created with mode 0777, as before.
 * Confirm that world-writable .php cache files are acceptable on the target
 * host.
 *
 * @param string $cachedir  Directory, relative to the current working directory
 * @param string $cachename File name without the ".php" extension
 * @param string $cachedata Contents to write
 * @return void
 */
function writetocache($cachedir, $cachename, $cachedata = '')
{
    $cachedir = './' . $cachedir . '/';
    $cachefile = $cachedir . $cachename . '.php';

    if (!is_dir($cachedir)) {
        @mkdir($cachedir, 0777, true);
    }

    $written = false;
    $fp = @fopen($cachefile, 'wb');
    if ($fp) {
        $written = @fwrite($fp, $cachedata) !== false;
        @fclose($fp);
    }

    if (!$written) {
        echo 'Can not write to cache files, please check directory ' . $cachedir . ' .';
        exit;
    }

    @chmod($cachefile, 0777);
}
// ####################### Collect the links matched in a document #######################
/**
 * Collect the first capture group of every match of $re in $ufile, apply a
 * str_replace($rep1, $rep2, ...) to each value, and return the distinct
 * results re-indexed from 0.
 *
 * @param string       $re    PCRE pattern with at least one capture group
 * @param string       $ufile Text to scan
 * @param string|array $rep1  Search value(s) passed to str_replace()
 * @param string|array $rep2  Replacement value(s) passed to str_replace()
 * @return array List of distinct links; empty when nothing matched
 */
function geturl($re, $ufile, $rep1, $rep2)
{
    preg_match_all($re, $ufile, $out, PREG_PATTERN_ORDER);

    // No match, or a pattern without a capture group, yields an empty list
    // rather than an undefined variable.
    $captured = isset($out[1]) ? $out[1] : array();
    $outs = str_replace($rep1, $rep2, $captured);

    // Merge identical links and re-index.
    return resetar($outs);
}
// ####################### Slice a string between two markers #######################
// cut() is already declared earlier in this file; declaring it a second time
// is a fatal "Cannot redeclare" error, so this copy is only defined when no
// cut() exists yet.
if (!function_exists('cut')) {
    /**
     * Return the text that follows the first occurrence of $start, ending at
     * the next occurrence of $end or of $start (whichever comes first), or at
     * the end of the string when neither follows.
     *
     * @param string $start Opening marker
     * @param string $end   Closing marker
     * @param string $file  Text to search
     * @return string The extracted text; '' when $start is absent or a marker is empty
     */
    function cut($start, $end, $file)
    {
        // explode() rejects empty delimiters, so there is nothing to cut.
        if ((string) $start === '' || (string) $end === '') {
            return '';
        }

        $content = explode($start, (string) $file);
        if (!isset($content[1])) {
            // $start does not occur in $file.
            return '';
        }

        $content = explode($end, $content[1]);
        return $content[0];
    }
}
// ####################### Remove a marked segment #######################
// del() is already declared earlier in this file; declaring it a second time
// is a fatal "Cannot redeclare" error, so this copy is only defined when no
// del() exists yet.
if (!function_exists('del')) {
    /**
     * Remove the segment "$start ... $end" (markers included) from $content.
     *
     * The inner text is located with cut() and removed together with its
     * markers as one unit, so identical text outside the markers is left
     * untouched. Empty "$start$end" pairs are removed as well. When $end does
     * not follow $start, $content is returned unchanged.
     *
     * @param string $start   Opening marker
     * @param string $end     Closing marker
     * @param string $content Text to clean
     * @return string Cleaned text
     */
    function del($start, $end, $content)
    {
        $del = cut($start, $end, $content);
        $content = str_replace($start . $del . $end, "", $content);
        $content = str_replace($start . $end, "", $content);
        return $content;
    }
}
// ####################### De-duplicate an array and re-index it #######################
/**
 * Drop duplicate values (keeping the first occurrence of each) and return the
 * survivors as a list indexed from 0.
 *
 * @param array $outs Values to de-duplicate
 * @return array Distinct values, in their original order
 */
function resetar($outs)
{
    return array_values(array_unique($outs));
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: