您的位置:首页 > 其它

我来做百科(第二十天) B

2008-02-19 03:54 99 查看
Tag 系统已经完成,再修复一些添加词条、修改内容方面的问题,就可以做数据采集了。

关于数据采集,网上的资料很多,我结合自己的需要,写了如下代码:


/// <summary>
/// Click handler that crawls four list pages, follows every entry link found on
/// them, extracts the entry via <c>GetLemma</c>/<c>GetDetail</c>/<c>GetTag</c>,
/// stores it through <c>Lemma.AddLemma</c>, and streams a progress report to the
/// response after each step.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    Lemma lemma = new Lemma();

    Response.Write("采集结果:<br/><br/>");
    Response.Flush();

    // Appends 0, 10, 20, 30 to strurl — presumably a paging offset; confirm against strurl.
    for (int i = 0; i <= 3; i++)
    {
        string sUrl = strurl + (i * 10).ToString();

        Response.Write("采集url:" + sUrl + "<br/>");
        Response.Flush();

        // Entry links have the form "/view/<digits>.htm". The digit class and the
        // literal dot must be backslash-escaped; the previous pattern used forward
        // slashes and could never match such a link.
        foreach (string temp in GetHtmls(@"/view/\d+\.htm", GetUrlHtml(sUrl)))
        {
            string url = u + temp;
            string sHtml = GetUrlHtml(url);

            // GetUrlHtml returns an empty string when the download fails; do not
            // parse or store an entry built from nothing.
            if (string.IsNullOrEmpty(sHtml))
            {
                Response.Write("失败!无法获取页面:" + System.Web.HttpUtility.HtmlEncode(url) + "<br/><br/>");
                Response.Flush();
                continue;
            }

            string sLemma = GetLemma(sHtml);
            string sDetail = GetDetail(sHtml);
            string sTag = GetTag(sHtml);

            int idLemma = lemma.AddLemma(sLemma, sDetail, "cloud", 0, string.Empty, url, sTag);

            // Scraped text comes from a remote site, so it is HTML-encoded before
            // being echoed into the report page.
            StringBuilder sb = new StringBuilder();
            sb.Append("id:").Append(idLemma)
              .Append("<br/> 词条:").Append(System.Web.HttpUtility.HtmlEncode(sLemma)).Append("<br/>");
            sb.Append("Tag:").Append(System.Web.HttpUtility.HtmlEncode(sTag))
              .Append("<br/> 连接:<a href='").Append(System.Web.HttpUtility.HtmlAttributeEncode(url))
              .Append("' target='_blank'>").Append(System.Web.HttpUtility.HtmlEncode(url)).Append("</a><br/>");

            // A positive return value is used as the new entry id; anything else
            // is reported as an error code.
            if (idLemma > 0)
            {
                sb.Append("成功!").Append(" <a href='../index/show.aspx?id=").Append(idLemma).Append("' target='_blank'>查看</a>");
            }
            else
            {
                sb.Append("失败!错误代码:").Append(idLemma);
            }

            sb.Append("<br/><br/>");

            Response.Write(sb.ToString());
            Response.Flush();
        }
    }
}




/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the bytes with
/// <see cref="Encoding.Default"/>. The request sends the URL itself as the
/// Referer header.
/// </summary>
/// <param name="url">Address to download.</param>
/// <returns>
/// The decoded response body, or an empty string when <paramref name="url"/> is
/// null/empty or the download fails for any reason (best-effort contract).
/// </returns>
public static string GetUrlHtml(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }

    try
    {
        // WebClient is IDisposable; release it deterministically.
        using (WebClient webclient = new WebClient())
        {
            webclient.Headers.Add("Referer", url);
            byte[] buff = webclient.DownloadData(url);

            // NOTE(review): Encoding.Default is platform dependent — confirm it
            // matches the charset of the pages being scraped.
            return Encoding.Default.GetString(buff);
        }
    }
    catch (System.Exception ex)
    {
        // Callers rely on "empty string means failure", so the exception is not
        // rethrown, but it is traced instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetUrlHtml failed for '" + url + "': " + ex.Message);
        return string.Empty;
    }
}




/// <summary>
/// Returns the text found between the first occurrence of <paramref name="begin"/>
/// and the nearest following <paramref name="end"/> in <paramref name="content"/>.
/// </summary>
/// <param name="begin">Regex fragment marking the start delimiter.</param>
/// <param name="end">Regex fragment marking the end delimiter.</param>
/// <param name="content">Text to search.</param>
/// <returns>The text between the delimiters, or an empty string when there is no match.</returns>
public static string GetHtml(string begin, string end, string content)
{
    // [\s\S]*? lazily matches any characters including line breaks. The previous
    // fragment "((.*?//n?)*?)" required a literal "//" and so never spanned
    // ordinary content; a nested lazy quantifier is also avoided here.
    return GetHtml(begin + @"([\s\S]*?)" + end, content);
}

/// <summary>
/// Runs <paramref name="pattern"/> against <paramref name="content"/> and returns
/// the first capture group of the first match.
/// </summary>
/// <param name="pattern">Regular expression; group 1 is the value returned.</param>
/// <param name="content">Text to search.</param>
/// <returns>Group 1 of the first match, or an empty string when nothing matches.</returns>
public static string GetHtml(string pattern, string content)
{
    Match match = Regex.Match(content, pattern);

    return match.Success ? match.Groups[1].Value : string.Empty;
}




/// <summary>
/// Collects every span of <paramref name="content"/> that starts with
/// <paramref name="begin"/> and ends with the nearest following <paramref name="end"/>.
/// </summary>
/// <param name="begin">Regex fragment marking the start delimiter.</param>
/// <param name="end">Regex fragment marking the end delimiter.</param>
/// <param name="content">Text to search.</param>
/// <returns>The full text of each match, delimiters included, in document order.</returns>
public static StringCollection GetHtmls(string begin, string end, string content)
{
    // [\s\S]*? lazily matches any characters including line breaks. The previous
    // fragment "((.*?//n?)*?)" required a literal "//" between the delimiters.
    return GetHtmls(begin + @"([\s\S]*?)" + end, content);
}

/// <summary>
/// Collects the full text of every match of <paramref name="pattern"/> in
/// <paramref name="content"/>.
/// </summary>
/// <param name="pattern">Regular expression to apply.</param>
/// <param name="content">Text to search.</param>
/// <returns>The value of each match in document order; empty when nothing matches.</returns>
public static StringCollection GetHtmls(string pattern, string content)
{
    StringCollection list = new StringCollection();

    // A MatchCollection only ever holds successful matches, so no per-item
    // Match.Empty check is needed.
    foreach (Match match in Regex.Matches(content, pattern))
    {
        list.Add(match.Value);
    }

    return list;
}






/// <summary>
/// Regex-based replace: substitutes every match of <paramref name="pattern"/> in
/// <paramref name="input"/> with <paramref name="replacement"/>, ignoring case and
/// with <see cref="RegexOptions.Multiline"/> set.
/// </summary>
/// <param name="input">Text to process; null or empty yields an empty string.</param>
/// <param name="pattern">Regular expression to search for.</param>
/// <param name="replacement">Replacement string (regex substitution syntax applies).</param>
/// <returns>The text after replacement, or an empty string for null/empty input.</returns>
public static string ReplaceText(string input, string pattern, string replacement)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;

    return string.IsNullOrEmpty(input)
        ? string.Empty
        : Regex.Replace(input, pattern, replacement, options);
}






/// <summary>
/// Removes every <paramref name="tag"/> element from <paramref name="input"/>,
/// including the content between its opening and closing tags.
/// </summary>
/// <param name="input">HTML text to process.</param>
/// <param name="tag">Tag name, e.g. <c>script</c>; it is inserted into the regex as-is.</param>
/// <returns>The text with those elements removed (empty string for null/empty input, per ReplaceText).</returns>
public static string ClearWholeTag(string input, string tag)
{
    // \b stops "b" from also matching "<br>" or "<body>".
    // [\s\S]*? is used instead of .*? because "." does not match line breaks
    // under the options ReplaceText applies, so multi-line elements were
    // previously left in place.
    return ReplaceText(input, @"<" + tag + @"\b[^>]*?>[\s\S]*?</" + tag + ">", "");
}






/// <summary>
/// Removes the opening and closing <paramref name="tag"/> markup from
/// <paramref name="input"/> while keeping the content between them.
/// </summary>
/// <param name="input">HTML text to process.</param>
/// <param name="tag">Tag name, e.g. <c>a</c>; it is inserted into the regex as-is.</param>
/// <returns>The text with that tag's markup stripped (empty string for null/empty input, per ReplaceText).</returns>
public static string ClearTag(string input, string tag)
{
    // "</?" makes the slash optional so both "<a ...>" and "</a>" are matched;
    // the previous "<//?" demanded at least one slash and skipped opening tags.
    // \b stops "a" from also matching "<abbr>" or "<address>".
    return ReplaceText(input, @"</?" + tag + @"\b[^>]*>", "");
}






/// <summary>
/// Removes all HTML tags from <paramref name="input"/>, keeping the text between them.
/// </summary>
/// <param name="input">HTML text to process.</param>
/// <returns>The text with every tag stripped (empty string for null/empty input, per ReplaceText).</returns>
public static string ClearAllTag(string input)
{
    // "</?" makes the slash optional so opening and closing tags are both
    // matched; the previous "<//?" demanded at least one slash and left every
    // opening tag in place.
    return ReplaceText(input, @"</?[a-zA-Z]+[^>]*>", "");
}

数据采集就是爽,先来三百多条吧,哈哈。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: